diff --git a/ERNIE/batching.py b/ERNIE/batching.py
new file mode 100644
index 0000000000000000000000000000000000000000..618f66206965df45a2646ffae0d35c7bf83fb4e5
--- /dev/null
+++ b/ERNIE/batching.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+    """
+    Add masks to batch_tokens and return out, mask_label, mask_pos.
+    Note: mask_pos indexes into batch_tokens after padding.
+    """
+    max_len = max([len(sent) for sent in batch_tokens])
+    mask_label = []
+    mask_pos = []
+    prob_mask = np.random.rand(total_token_num)
+    # Note: the first token is [CLS], so [low=1]
+    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+    pre_sent_len = 0
+    prob_index = 0
+    for sent_index, sent in enumerate(batch_tokens):
+        mask_flag = False
+        mask_word = mask_word_tags[sent_index]
+        prob_index += pre_sent_len
+        if mask_word:
+            beg = 0
+            for token_index, token in enumerate(sent):
+                seg_label = seg_labels[sent_index][token_index]
+                if seg_label == 1:
+                    continue
+                if beg == 0:
+                    if seg_label != -1:
+                        beg = token_index
+                    continue
+
+                prob = prob_mask[prob_index + beg]
+                if prob > 0.15:
+                    pass
+                else:
+                    for index in range(beg, token_index):
+                        prob = prob_mask[prob_index + index]
+                        base_prob = 1.0
+                        if index == beg:
+                            base_prob = 0.15
+                        if base_prob * 0.2 < prob <= base_prob:
+                            mask_label.append(sent[index])
+                            sent[index] = MASK
+                            mask_flag = True
+                            mask_pos.append(sent_index * max_len + index)
+                        elif base_prob * 0.1 < prob <= base_prob * 0.2:
+                            mask_label.append(sent[index])
+                            sent[index] = replace_ids[prob_index + index]
+                            mask_flag = True
+                            mask_pos.append(sent_index * max_len + index)
+                        else:
+                            mask_label.append(sent[index])
+                            mask_pos.append(sent_index * max_len + index)
+
+                if seg_label == -1:
+                    beg = 0
+                else:
+                    beg = token_index
+        else:
+            for token_index, token in enumerate(sent):
+                prob = prob_mask[prob_index + token_index]
+                if prob > 0.15:
+                    continue
+                elif 0.03 < prob <= 0.15:
+                    # mask
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        sent[token_index] = MASK
+                        mask_flag = True
+                        mask_pos.append(sent_index * max_len + token_index)
+                elif 0.015 < prob <= 0.03:
+                    # random replace
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        sent[token_index] = replace_ids[prob_index + token_index]
+                        mask_flag = True
+                        mask_pos.append(sent_index * max_len + token_index)
+                else:
+                    # keep the original token
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        mask_pos.append(sent_index * max_len + token_index)
+
+        pre_sent_len = len(sent)
+
+    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
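+    # mask_label and mask_pos are both shaped [num_masked_tokens, 1]; mask_pos
+    # indexes the flattened padded batch, e.g. with max_len=128 the token at
+    # position 5 of the third sentence (sent_index=2) maps to 2 * 128 + 5 = 261.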
+    return batch_tokens, mask_label, mask_pos
+
+
+def prepare_batch_data(insts,
+                       total_token_num,
+                       voc_size=0,
+                       pad_id=None,
+                       cls_id=None,
+                       sep_id=None,
+                       mask_id=None,
+                       return_attn_bias=True,
+                       return_max_len=True,
+                       return_num_token=False):
+
+    batch_src_ids = [inst[0] for inst in insts]
+    batch_sent_ids = [inst[1] for inst in insts]
+    batch_pos_ids = [inst[2] for inst in insts]
+    labels = [inst[3] for inst in insts]
+    labels = np.array(labels).astype("int64").reshape([-1, 1])
+    seg_labels = [inst[4] for inst in insts]
+    mask_word_tags = [inst[5] for inst in insts]
+
+    # First step: do mask without padding
+    assert mask_id >= 0, "[FATAL] mask_id must be >= 0"
+    out, mask_label, mask_pos = mask(
+        batch_src_ids,
+        seg_labels,
+        mask_word_tags,
+        total_token_num,
+        vocab_size=voc_size,
+        CLS=cls_id,
+        SEP=sep_id,
+        MASK=mask_id)
+
+    # Second step: padding
+    src_id, next_sent_index, self_attn_bias = pad_batch_data(
+        out, pad_idx=pad_id, return_next_sent_pos=True, return_attn_bias=True)
+    pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
+    sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)
+
+    return_list = [
+        src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels,
+        next_sent_index
+    ]
+
+    return return_list
+
+
+def pad_batch_data(insts,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_next_sent_pos=False,
+                   return_attn_bias=False,
+                   return_max_len=False,
+                   return_num_token=False):
+    """
+    Pad the instances to the max sequence length in the batch, and generate the
+    corresponding position data and attention bias.
+    """
+    return_list = []
+    max_len = max(len(inst) for inst in insts)
+    # Any token included in the dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and have no effect on parameter gradients.
+
+    inst_data = np.array(
+        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+
+    # next_sent_pos is used to extract the first token's embedding of each sentence
+    if return_next_sent_pos:
+        batch_size = inst_data.shape[0]
+        max_seq_len = inst_data.shape[1]
+        next_sent_index = np.array(
+            range(0, batch_size * max_seq_len, max_seq_len)).astype(
+                "int64").reshape(-1, 1)
+        return_list += [next_sent_index]
+
+    # position data
+    if return_pos:
+        inst_pos = np.array([
+            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+            for inst in insts
+        ])
+
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+
+    if return_attn_bias:
+        # This is used to avoid attention on paddings.
+        slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                       (max_len - len(inst)) for inst in insts])
+        slf_attn_bias_data = np.tile(
+            slf_attn_bias_data.reshape([-1, 1, max_len]), [1, max_len, 1])
+        return_list += [slf_attn_bias_data.astype("float32")]
+
+    if return_max_len:
+        return_list += [max_len]
+
+    if return_num_token:
+        num_token = 0
+        for inst in insts:
+            num_token += len(inst)
+        return_list += [num_token]
+
+    return return_list if len(return_list) > 1 else return_list[0]
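+
+# A minimal sketch of pad_batch_data's default output (illustrative values only):
+#
+#   >>> insts = [[5, 6, 7], [8, 9]]
+#   >>> src = pad_batch_data(insts, pad_idx=0)
+#   >>> src.shape    # (batch_size, max_len, 1)
+#   (2, 3, 1)
+#   >>> src[1].reshape(-1).tolist()
+#   [8, 9, 0]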
+
+
+if __name__ == "__main__":
+    pass
diff --git a/ERNIE/finetune/__init__.py b/ERNIE/finetune/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ERNIE/finetune/classifier.py b/ERNIE/finetune/classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1f1f9d217b3a6eb6ed15f7fab6497b32446132
--- /dev/null
+++ b/ERNIE/finetune/classifier.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model for classifier."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import numpy as np
+
+import paddle.fluid as fluid
+
+from model.ernie import ErnieModel
+
+
+def create_model(args,
+                 pyreader_name,
+                 ernie_config,
+                 is_prediction=False):
+    pyreader = fluid.layers.py_reader(
+        capacity=50,
+        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+                [-1, args.max_seq_len, 1],
+                [-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1],
+                [-1, 1]],
+        dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64'],
+        lod_levels=[0, 0, 0, 0, 0, 0, 0],
+        name=pyreader_name,
+        use_double_buffer=True)
+
+    (src_ids, sent_ids, pos_ids, self_attn_mask, labels,
+     next_sent_index, qids) = fluid.layers.read_file(pyreader)
+
+    ernie = ErnieModel(
+        src_ids=src_ids,
+        position_ids=pos_ids,
+        sentence_ids=sent_ids,
+        self_attn_mask=self_attn_mask,
+        config=ernie_config,
+        use_fp16=args.use_fp16)
+
+    cls_feats = ernie.get_pooled_output(next_sent_index)
+    cls_feats = fluid.layers.dropout(
+        x=cls_feats,
+        dropout_prob=0.1,
+        dropout_implementation="upscale_in_train")
+    logits = fluid.layers.fc(
+        input=cls_feats,
+        size=ernie_config["num_labels"],
+        param_attr=fluid.ParamAttr(
+            name="cls_out_w",
+            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+        bias_attr=fluid.ParamAttr(
+            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+
+    if is_prediction:
+        probs = fluid.layers.softmax(logits)
+        feed_targets_name = [
+            src_ids.name, pos_ids.name, sent_ids.name, self_attn_mask.name,
+            next_sent_index.name
+        ]
+        return pyreader, probs, feed_targets_name
+
+    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+        logits=logits, label=labels, return_softmax=True)
+    loss = fluid.layers.mean(x=ce_loss)
+
+    if args.use_fp16 and args.loss_scaling > 1.0:
+        loss *= args.loss_scaling
+
+    num_seqs = fluid.layers.create_tensor(dtype='int64')
+    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
+
+    graph_vars = {"loss": loss,
+                  "probs": probs,
+                  "accuracy": accuracy,
+                  "labels": labels,
+                  "num_seqs": num_seqs,
+                  "qids": qids}
+
+    for k, v in graph_vars.items():
+        v.persistable = True
+
+    return pyreader, graph_vars
+
+
+def evaluate_mrr(preds):
+    last_qid = None
+    total_mrr = 0.0
+    qnum = 0.0
+    rank = 0.0
+    correct = False
+    for qid, score, label in preds:
+        if qid != last_qid:
+            rank = 0.0
+            qnum += 1
+            correct = False
+            last_qid = qid
+
+        rank += 1
+        if not correct and label != 0:
+            total_mrr += 1.0 / rank
+            correct = True
+
+    return total_mrr / qnum
+
+
+def evaluate_map(preds):
+    def single_map(st, en):
+        total_p = 0.0
+        correct_num = 0.0
+        for index in range(st, en):
+            if int(preds[index][2]) != 0:
+                correct_num += 1
+                total_p += correct_num / (index - st + 1)
+        if int(correct_num) == 0:
+            return 0.0
+        return total_p / correct_num
+
+    last_qid = None
+    total_map = 0.0
+    qnum = 0.0
+    st = 0
+    for i in range(len(preds)):
+        qid = preds[i][0]
+        if qid != last_qid:
+            qnum += 1
+            if last_qid is not None:
+                total_map += single_map(st, i)
+            st = i
+            last_qid = qid
+
+    total_map += single_map(st, len(preds))
+    return total_map / qnum
+
+
+def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
+    train_fetch_list = [graph_vars["loss"].name,
+                        graph_vars["accuracy"].name,
+                        graph_vars["num_seqs"].name]
+
+    if eval_phase == "train":
+        if "learning_rate" in graph_vars:
+            train_fetch_list.append(graph_vars["learning_rate"].name)
+        outputs = exe.run(fetch_list=train_fetch_list)
+        ret = {"loss": np.mean(outputs[0]), "accuracy": np.mean(outputs[1])}
+        if "learning_rate" in graph_vars:
+            # the learning rate is the 4th fetch (after loss, accuracy, num_seqs)
+            ret["learning_rate"] = float(outputs[3][0])
+        return ret
+
+    test_pyreader.start()
+    total_cost, total_acc, total_num_seqs, total_label_pos_num, total_pred_pos_num, total_correct_num = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+    qids, labels, scores = [], [], []
+    time_begin = time.time()
+
+    fetch_list = [graph_vars["loss"].name,
+                  graph_vars["accuracy"].name,
+                  graph_vars["probs"].name,
+                  graph_vars["labels"].name,
+                  graph_vars["num_seqs"].name,
+                  graph_vars["qids"].name]
+    while True:
+        try:
+            np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(
+                program=test_program, fetch_list=fetch_list)
+            total_cost += np.sum(np_loss * np_num_seqs)
+            total_acc += np.sum(np_acc * np_num_seqs)
+            total_num_seqs += np.sum(np_num_seqs)
+            labels.extend(np_labels.reshape((-1)).tolist())
+            qids.extend(np_qids.reshape(-1).tolist())
+            scores.extend(np_probs[:, 1].reshape(-1).tolist())
+            np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
+            total_label_pos_num += np.sum(np_labels)
+            total_pred_pos_num += np.sum(np_preds)
+            total_correct_num += np.sum(np.dot(np_preds, np_labels))
+        except fluid.core.EOFException:
+            test_pyreader.reset()
+            break
+    time_end = time.time()
+
+    if len(qids) == 0:
+        print("[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s" %
+              (eval_phase, total_cost / total_num_seqs,
+               total_acc / total_num_seqs, total_num_seqs, time_end - time_begin))
+    else:
+        r = total_correct_num / total_label_pos_num
+        p = total_correct_num / total_pred_pos_num
+        f = 2 * p * r / (p + r)
+
+        assert len(qids) == len(labels) == len(scores)
+        preds = sorted(
+            zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1]))
+        mrr = evaluate_mrr(preds)
+        map_score = evaluate_map(preds)
+
+        print("[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s" %
+              (eval_phase, total_cost / total_num_seqs,
+               total_acc / total_num_seqs,
+               mrr, map_score, p, r, f, total_num_seqs, time_end - time_begin))
diff --git a/ERNIE/finetune/sequence_label.py b/ERNIE/finetune/sequence_label.py
new file mode 100644
index 0000000000000000000000000000000000000000..327c9e56f0d2ae6f10718a88bce440b79b63dc18
--- /dev/null
+++ b/ERNIE/finetune/sequence_label.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
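+"""Model and chunk-level evaluation for sequence labeling fine-tuning."""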
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+import numpy as np
+import multiprocessing
+
+import paddle
+import paddle.fluid as fluid
+
+from model.ernie import ErnieModel
+
+
+def create_model(args,
+                 pyreader_name,
+                 ernie_config,
+                 is_prediction=False):
+    pyreader = fluid.layers.py_reader(
+        capacity=50,
+        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+                [-1, args.max_seq_len, 1],
+                [-1, args.max_seq_len, args.max_seq_len],
+                [-1, args.max_seq_len, 1], [-1, 1]],
+        dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64'],
+        lod_levels=[0, 0, 0, 0, 0, 0],
+        name=pyreader_name,
+        use_double_buffer=True)
+
+    (src_ids, sent_ids, pos_ids, self_attn_mask, labels,
+     seq_lens) = fluid.layers.read_file(pyreader)
+
+    ernie = ErnieModel(
+        src_ids=src_ids,
+        position_ids=pos_ids,
+        sentence_ids=sent_ids,
+        self_attn_mask=self_attn_mask,
+        config=ernie_config,
+        use_fp16=args.use_fp16)
+
+    enc_out = ernie.get_sequence_output()
+    logits = fluid.layers.fc(
+        input=enc_out,
+        size=args.num_labels,
+        num_flatten_dims=2,
+        param_attr=fluid.ParamAttr(
+            name="cls_seq_label_out_w",
+            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+        bias_attr=fluid.ParamAttr(
+            name="cls_seq_label_out_b",
+            initializer=fluid.initializer.Constant(0.)))
+
+    ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
+    ret_infers = fluid.layers.reshape(
+        x=fluid.layers.argmax(logits, axis=2), shape=[-1, 1])
+
+    labels = fluid.layers.flatten(labels, axis=2)
+    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+        logits=fluid.layers.flatten(logits, axis=2),
+        label=labels, return_softmax=True)
+    loss = fluid.layers.mean(x=ce_loss)
+
+    if args.use_fp16 and args.loss_scaling > 1.0:
+        loss *= args.loss_scaling
+
+    graph_vars = {"loss": loss,
+                  "probs": probs,
+                  "labels": ret_labels,
+                  "infers": ret_infers,
+                  "seq_lens": seq_lens}
+
+    for k, v in graph_vars.items():
+        v.persistable = True
+
+    return pyreader, graph_vars
+
+
+def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
+
+    def extract_bio_chunk(seq):
+        chunks = []
+        cur_chunk = None
+        null_index = tag_num - 1
+        for index in range(len(seq)):
+            tag = seq[index]
+            tag_type = tag // 2
+            tag_pos = tag % 2
+
+            if tag == null_index:
+                if cur_chunk is not None:
+                    chunks.append(cur_chunk)
+                cur_chunk = None
+                continue
+
+            if tag_pos == 0:
+                if cur_chunk is not None:
+                    chunks.append(cur_chunk)
+                cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+
+            else:
+                if cur_chunk is None:
+                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+                    continue
+
+                if cur_chunk["type"] == tag_type:
+                    cur_chunk["en"] = index + 1
+                else:
+                    chunks.append(cur_chunk)
+                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+
+        if cur_chunk is not None:
+            chunks.append(cur_chunk)
+        return chunks
+
+    null_index = tag_num - 1
+    num_label = 0
+    num_infer = 0
+    num_correct = 0
+    labels = np_labels.reshape([-1]).astype(np.int32).tolist()
+    infers = np_infers.reshape([-1]).astype(np.int32).tolist()
+    all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()
+
+    base_index = 0
+    for dev_index in range(dev_count):
+        lens = all_lens[dev_index]
+        max_len = 0
+        for l in lens:
+            max_len = max(max_len, l)
+
+        for i in range(len(lens)):
+            seq_st = base_index + i * max_len + 1
+            seq_en = seq_st + (lens[i] - 2)
+            infer_chunks = extract_bio_chunk(infers[seq_st:seq_en])
+            label_chunks = extract_bio_chunk(labels[seq_st:seq_en])
+            num_infer += len(infer_chunks)
+            num_label += len(label_chunks)
+
+            infer_index = 0
+            label_index = 0
+            while label_index < len(label_chunks) \
+                    and infer_index < len(infer_chunks):
+                if infer_chunks[infer_index]["st"] < label_chunks[label_index]["st"]:
+                    infer_index += 1
+                elif infer_chunks[infer_index]["st"] > label_chunks[label_index]["st"]:
+                    label_index += 1
+                else:
+                    if infer_chunks[infer_index]["en"] == label_chunks[label_index]["en"] and \
+                            infer_chunks[infer_index]["type"] == label_chunks[label_index]["type"]:
+                        num_correct += 1
+
+                    infer_index += 1
+                    label_index += 1
+
+        base_index += max_len * len(lens)
+
+    return num_label, num_infer, num_correct
+
+
+def calculate_f1(num_label, num_infer, num_correct):
+    if num_infer == 0:
+        precision = 0.0
+    else:
+        precision = num_correct * 1.0 / num_infer
+
+    if num_label == 0:
+        recall = 0.0
+    else:
+        recall = num_correct * 1.0 / num_label
+
+    if num_correct == 0:
+        f1 = 0.0
+    else:
+        f1 = 2 * precision * recall / (precision + recall)
+    return precision, recall, f1
+
+
+def evaluate(exe, program, pyreader, graph_vars, tag_num, eval_phase, dev_count=1):
+    fetch_list = [graph_vars["labels"].name,
+                  graph_vars["infers"].name,
+                  graph_vars["seq_lens"].name]
+
+    if eval_phase == "train":
+        fetch_list.append(graph_vars["loss"].name)
+        if "learning_rate" in graph_vars:
+            fetch_list.append(graph_vars["learning_rate"].name)
+        outputs = exe.run(fetch_list=fetch_list)
+        np_labels, np_infers, np_lens, np_loss = outputs[:4]
+        num_label, num_infer, num_correct = chunk_eval(
+            np_labels, np_infers, np_lens, tag_num, dev_count)
+        precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
+        ret = {"precision": precision, "recall": recall, "f1": f1,
+               "loss": np.mean(np_loss)}
+        if "learning_rate" in graph_vars:
+            # read the learning rate (5th fetch) from the raw fetch results
+            # before they are replaced by the result dict
+            ret["lr"] = float(outputs[4][0])
+        return ret
+
+    else:
+        total_label, total_infer, total_correct = 0.0, 0.0, 0.0
+        time_begin = time.time()
+        pyreader.start()
+        while True:
+            try:
+                np_labels, np_infers, np_lens = exe.run(program=program,
+                                                        fetch_list=fetch_list)
+                label_num, infer_num, correct_num = chunk_eval(
+                    np_labels, np_infers, np_lens, tag_num, dev_count)
+                total_infer += infer_num
+                total_label += label_num
+                total_correct += correct_num
+
+            except fluid.core.EOFException:
+                pyreader.reset()
+                break
+
+        precision, recall, f1 = calculate_f1(total_label, total_infer,
+                                             total_correct)
+        time_end = time.time()
+
+        print("[%s evaluation] f1: %f, precision: %f, recall: %f, elapsed time: %f s" %
+              (eval_phase, f1, precision, recall, time_end - time_begin))
diff --git a/ERNIE/finetune_args.py b/ERNIE/finetune_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f7d9349b8087f57addfbdf9cf8b1bf48156d65e
--- /dev/null
+++ b/ERNIE/finetune_args.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
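+"""Argument definitions for ERNIE fine-tuning."""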
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+
+from utils.args import ArgumentGroup
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
+model_g.add_arg("ernie_config_path", str, None, "Path to the json file for ernie model config.")
+model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
+model_g.add_arg("init_pretraining_params", str, None,
+                "Init pre-training params to start fine-tuning from. If the "
+                "arg 'init_checkpoint' has been set, this argument is ignored.")
+model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
+
+train_g = ArgumentGroup(parser, "training", "training options.")
+train_g.add_arg("epoch", int, 3, "Number of epochs for fine-tuning.")
+train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
+train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
+                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
+train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
+train_g.add_arg("warmup_proportion", float, 0.1,
+                "Proportion of training steps to perform linear learning rate warmup for.")
+train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
+train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
+train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
+train_g.add_arg("loss_scaling", float, 1.0,
+                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
+
+log_g = ArgumentGroup(parser, "logging", "logging related.")
+log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
+log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
+
+data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
+data_g.add_arg("train_set", str, None, "Path to training data.")
+data_g.add_arg("test_set", str, None, "Path to test data.")
+data_g.add_arg("dev_set", str, None, "Path to validation data.")
+data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
+data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest sequence.")
+data_g.add_arg("batch_size", int, 32, "Total number of examples in one training batch. See also --in_tokens.")
+data_g.add_arg("in_tokens", bool, False,
+               "If set, the batch size will be the maximum number of tokens in one batch. "
+               "Otherwise, it will be the maximum number of examples in one batch.")
+data_g.add_arg("do_lower_case", bool, True,
+               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
+data_g.add_arg("random_seed", int, 0, "Random seed.")
+data_g.add_arg("label_map_config", str, None, "Path to the label map config.")
+data_g.add_arg("num_labels", int, 2, "Number of labels.")
+
+run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
+run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
+run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.")
+run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
+run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
+run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
+run_type_g.add_arg("metrics", bool, True, "Whether to compute and print evaluation metrics.")
diff --git a/ERNIE/model/__init__.py b/ERNIE/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ERNIE/model/ernie.py b/ERNIE/model/ernie.py
new file mode 100644
index 0000000000000000000000000000000000000000..69d6e112eb97855e933800b57d5eeb560caf2257
--- /dev/null
+++ b/ERNIE/model/ernie.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Ernie model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +import json +import numpy as np +import paddle.fluid as fluid +from model.transformer_encoder import encoder, pre_process_layer + + +class ErnieConfig(object): + def __init__(self, config_path): + self._config_dict = self._parse(config_path) + + def _parse(self, config_path): + try: + with open(config_path) as json_file: + config_dict = json.load(json_file) + except Exception: + raise IOError("Error in parsing Ernie model config file '%s'" % + config_path) + else: + return config_dict + + def __getitem__(self, key): + return self._config_dict[key] + + def print_config(self): + for arg, value in sorted(six.iteritems(self._config_dict)): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +class ErnieModel(object): + def __init__(self, + src_ids, + position_ids, + sentence_ids, + self_attn_mask, + config, + weight_sharing=True, + use_fp16=False): + + self._emb_size = config['hidden_size'] + self._n_layer = config['num_hidden_layers'] + self._n_head = config['num_attention_heads'] + self._voc_size = config['vocab_size'] + self._max_position_seq_len = config['max_position_embeddings'] + self._sent_types = config['type_vocab_size'] + self._hidden_act = config['hidden_act'] + self._prepostprocess_dropout = config['hidden_dropout_prob'] + self._attention_dropout = config['attention_probs_dropout_prob'] + self._weight_sharing = weight_sharing + + self._word_emb_name = "word_embedding" + self._pos_emb_name = "pos_embedding" + self._sent_emb_name = "sent_embedding" + self._dtype = "float16" if use_fp16 else "float32" + + # Initialize all weigths by truncated normal initializer, and all biases + # will be initialized by constant zero by default. 
+ self._param_initializer = fluid.initializer.TruncatedNormal( + scale=config['initializer_range']) + + self._build_model(src_ids, position_ids, sentence_ids, self_attn_mask) + + def _build_model(self, src_ids, position_ids, sentence_ids, self_attn_mask): + # padding id in vocabulary must be set to 0 + emb_out = fluid.layers.embedding( + input=src_ids, + size=[self._voc_size, self._emb_size], + dtype=self._dtype, + param_attr=fluid.ParamAttr( + name=self._word_emb_name, initializer=self._param_initializer), + is_sparse=False) + position_emb_out = fluid.layers.embedding( + input=position_ids, + size=[self._max_position_seq_len, self._emb_size], + dtype=self._dtype, + param_attr=fluid.ParamAttr( + name=self._pos_emb_name, initializer=self._param_initializer)) + + sent_emb_out = fluid.layers.embedding( + sentence_ids, + size=[self._sent_types, self._emb_size], + dtype=self._dtype, + param_attr=fluid.ParamAttr( + name=self._sent_emb_name, initializer=self._param_initializer)) + + emb_out = emb_out + position_emb_out + emb_out = emb_out + sent_emb_out + + emb_out = pre_process_layer( + emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') + + if self._dtype is "float16": + self_attn_mask = fluid.layers.cast( + x=self_attn_mask, dtype=self._dtype) + + n_head_self_attn_mask = fluid.layers.stack( + x=[self_attn_mask] * self._n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + + self._enc_out = encoder( + enc_input=emb_out, + attn_bias=n_head_self_attn_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd="", + postprocess_cmd="dan", + param_initializer=self._param_initializer, + name='encoder') + + def get_sequence_output(self): + return self._enc_out + + def get_pooled_output(self, next_sent_index): + """Get the first feature of each sequence for classification""" + self._reshaped_emb_out = fluid.layers.reshape( + x=self._enc_out, shape=[-1, self._emb_size], inplace=True) + next_sent_index = fluid.layers.cast(x=next_sent_index, dtype='int32') + next_sent_feat = fluid.layers.gather( + input=self._reshaped_emb_out, index=next_sent_index) + next_sent_feat = fluid.layers.fc( + input=next_sent_feat, + size=self._emb_size, + act="tanh", + param_attr=fluid.ParamAttr( + name="pooled_fc.w_0", initializer=self._param_initializer), + bias_attr="pooled_fc.b_0") + return next_sent_feat + + def get_pretraining_output(self, mask_label, mask_pos, labels, + next_sent_index): + """Get the loss & accuracy for pretraining""" + + mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') + + # extract the first token feature in each sentence + next_sent_feat = self.get_pooled_output(next_sent_index) + # extract masked tokens' feature + mask_feat = fluid.layers.gather( + input=self._reshaped_emb_out, index=mask_pos) + + # transform: fc + mask_trans_feat = fluid.layers.fc( + input=mask_feat, + size=self._emb_size, + act=self._hidden_act, + param_attr=fluid.ParamAttr( + name='mask_lm_trans_fc.w_0', + initializer=self._param_initializer), + bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) + # transform: layer norm + mask_trans_feat = pre_process_layer( + mask_trans_feat, 'n', name='mask_lm_trans') + + mask_lm_out_bias_attr = fluid.ParamAttr( + name="mask_lm_out_fc.b_0", + 
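+        # When weight sharing is enabled, the word embedding matrix is reused
+        # (transposed) as the output projection of the masked-LM head.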
+        if self._weight_sharing:
+            fc_out = fluid.layers.matmul(
+                x=mask_trans_feat,
+                y=fluid.default_main_program().global_block().var(
+                    self._word_emb_name),
+                transpose_y=True)
+            fc_out += fluid.layers.create_parameter(
+                shape=[self._voc_size],
+                dtype=self._dtype,
+                attr=mask_lm_out_bias_attr,
+                is_bias=True)
+
+        else:
+            fc_out = fluid.layers.fc(input=mask_trans_feat,
+                                     size=self._voc_size,
+                                     param_attr=fluid.ParamAttr(
+                                         name="mask_lm_out_fc.w_0",
+                                         initializer=self._param_initializer),
+                                     bias_attr=mask_lm_out_bias_attr)
+
+        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
+            logits=fc_out, label=mask_label)
+        mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
+
+        next_sent_fc_out = fluid.layers.fc(
+            input=next_sent_feat,
+            size=2,
+            param_attr=fluid.ParamAttr(
+                name="next_sent_fc.w_0", initializer=self._param_initializer),
+            bias_attr="next_sent_fc.b_0")
+
+        next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
+            logits=next_sent_fc_out, label=labels, return_softmax=True)
+
+        next_sent_acc = fluid.layers.accuracy(
+            input=next_sent_softmax, label=labels)
+
+        mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
+
+        loss = mean_next_sent_loss + mean_mask_lm_loss
+        return next_sent_acc, mean_mask_lm_loss, loss
diff --git a/ERNIE/model/transformer_encoder.py b/ERNIE/model/transformer_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a77ebe480f0e4a8e2b4f2c0c18b23383075fb7
--- /dev/null
+++ b/ERNIE/model/transformer_encoder.py
@@ -0,0 +1,342 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformer encoder."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import partial
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+
+def multi_head_attention(queries,
+                         keys,
+                         values,
+                         attn_bias,
+                         d_key,
+                         d_value,
+                         d_model,
+                         n_head=1,
+                         dropout_rate=0.,
+                         cache=None,
+                         param_initializer=None,
+                         name='multi_head_att'):
+    """
+    Multi-Head Attention. Note that attn_bias is added to the logit before
+    computing the softmax activation, to mask certain selected positions so
+    that they will not be considered in attention weights.
+    """
+    keys = queries if keys is None else keys
+    values = keys if values is None else values
+
+    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+        raise ValueError(
+            "Inputs: queries, keys and values should all be 3-D tensors.")
+
+    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
+        """
+        Add linear projection to queries, keys, and values.
+        """
+        q = layers.fc(input=queries,
+                      size=d_key * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_query_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_query_fc.b_0')
+        k = layers.fc(input=keys,
+                      size=d_key * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_key_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_key_fc.b_0')
+        v = layers.fc(input=values,
+                      size=d_value * n_head,
+                      num_flatten_dims=2,
+                      param_attr=fluid.ParamAttr(
+                          name=name + '_value_fc.w_0',
+                          initializer=param_initializer),
+                      bias_attr=name + '_value_fc.b_0')
+        return q, k, v
+
+    def __split_heads(x, n_head):
+        """
+        Reshape the last dimension of input tensor x so that it becomes two
+        dimensions and then transpose. Specifically, transform a tensor with
+        shape [bs, max_sequence_length, n_head * hidden_dim] into a tensor
+        with shape [bs, n_head, max_sequence_length, hidden_dim].
+        """
+        hidden_size = x.shape[-1]
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
+        reshaped = layers.reshape(
+            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
+
+        # permute the dimensions into:
+        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+    def __combine_heads(x):
+        """
+        Transpose and then reshape the last two dimensions of input tensor x
+        so that it becomes one dimension, which is reverse to __split_heads.
+        """
+        if len(x.shape) == 3: return x
+        if len(x.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+
+        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
+        return layers.reshape(
+            x=trans_x,
+            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+            inplace=True)
+
+    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
+        """
+        Scaled Dot-Product Attention
+        """
+        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
+        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+        if attn_bias:
+            product += attn_bias
+        weights = layers.softmax(product)
+        if dropout_rate:
+            weights = layers.dropout(
+                weights,
+                dropout_prob=dropout_rate,
+                dropout_implementation="upscale_in_train",
+                is_test=False)
+        out = layers.matmul(weights, v)
+        return out
+
+    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+
+    if cache is not None:  # use cache and concat time steps
+        # Since the inplace reshape in __split_heads changes the shape of k and
+        # v, which is the cache input for next time step, reshape the cache
+        # input from the previous time step first.
+        k = cache["k"] = layers.concat(
+            [layers.reshape(
+                cache["k"], shape=[0, 0, d_model]), k], axis=1)
+        v = cache["v"] = layers.concat(
+            [layers.reshape(
+                cache["v"], shape=[0, 0, d_model]), v], axis=1)
+
+    q = __split_heads(q, n_head)
+    k = __split_heads(k, n_head)
+    v = __split_heads(v, n_head)
+
+    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
+                                                  dropout_rate)
+
+    out = __combine_heads(ctx_multiheads)
+
+    # Project back to the model size.
+    proj_out = layers.fc(input=out,
+                         size=d_model,
+                         num_flatten_dims=2,
+                         param_attr=fluid.ParamAttr(
+                             name=name + '_output_fc.w_0',
+                             initializer=param_initializer),
+                         bias_attr=name + '_output_fc.b_0')
+    return proj_out
+
+
+def positionwise_feed_forward(x,
+                              d_inner_hid,
+                              d_hid,
+                              dropout_rate,
+                              hidden_act,
+                              param_initializer=None,
+                              name='ffn'):
+    """
+    Position-wise Feed-Forward Networks.
+    This module consists of two linear transformations with a ReLU activation
+    in between, which is applied to each position separately and identically.
+    """
+    hidden = layers.fc(input=x,
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       act=hidden_act,
+                       param_attr=fluid.ParamAttr(
+                           name=name + '_fc_0.w_0',
+                           initializer=param_initializer),
+                       bias_attr=name + '_fc_0.b_0')
+    if dropout_rate:
+        hidden = layers.dropout(
+            hidden,
+            dropout_prob=dropout_rate,
+            dropout_implementation="upscale_in_train",
+            is_test=False)
+    out = layers.fc(input=hidden,
+                    size=d_hid,
+                    num_flatten_dims=2,
+                    param_attr=fluid.ParamAttr(
+                        name=name + '_fc_1.w_0', initializer=param_initializer),
+                    bias_attr=name + '_fc_1.b_0')
+    return out
+
+
+def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
+                           name=''):
+    """
+    Add residual connection, layer normalization and dropout to the out tensor
+    optionally according to the value of process_cmd.
+    This will be used before or after multi-head attention and position-wise
+    feed-forward networks.
+    """
+    for cmd in process_cmd:
+        if cmd == "a":  # add residual connection
+            out = out + prev_out if prev_out else out
+        elif cmd == "n":  # add layer normalization
+            out_dtype = out.dtype
+            if out_dtype == fluid.core.VarDesc.VarType.FP16:
+                out = layers.cast(x=out, dtype="float32")
+            out = layers.layer_norm(
+                out,
+                begin_norm_axis=len(out.shape) - 1,
+                param_attr=fluid.ParamAttr(
+                    name=name + '_layer_norm_scale',
+                    initializer=fluid.initializer.Constant(1.)),
+                bias_attr=fluid.ParamAttr(
+                    name=name + '_layer_norm_bias',
+                    initializer=fluid.initializer.Constant(0.)))
+            if out_dtype == fluid.core.VarDesc.VarType.FP16:
+                out = layers.cast(x=out, dtype="float16")
+        elif cmd == "d":  # add dropout
+            if dropout_rate:
+                out = layers.dropout(
+                    out,
+                    dropout_prob=dropout_rate,
+                    dropout_implementation="upscale_in_train",
+                    is_test=False)
+    return out
+
+
+pre_process_layer = partial(pre_post_process_layer, None)
+post_process_layer = pre_post_process_layer
+
+
+def encoder_layer(enc_input,
+                  attn_bias,
+                  n_head,
+                  d_key,
+                  d_value,
+                  d_model,
+                  d_inner_hid,
+                  prepostprocess_dropout,
+                  attention_dropout,
+                  relu_dropout,
+                  hidden_act,
+                  preprocess_cmd="n",
+                  postprocess_cmd="da",
+                  param_initializer=None,
+                  name=''):
+    """The encoder layers that can be stacked to form a deep encoder.
+    This module consists of a multi-head (self) attention followed by
+    position-wise feed-forward networks, with both components accompanied
+    by post_process_layer to add residual connection, layer normalization
+    and dropout.
+ """ + attn_output = multi_head_attention( + pre_process_layer( + enc_input, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_att'), + None, + None, + attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + param_initializer=param_initializer, + name=name + '_multi_head_att') + attn_output = post_process_layer( + enc_input, + attn_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_att') + ffd_output = positionwise_feed_forward( + pre_process_layer( + attn_output, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_ffn'), + d_inner_hid, + d_model, + relu_dropout, + hidden_act, + param_initializer=param_initializer, + name=name + '_ffn') + return post_process_layer( + attn_output, + ffd_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_ffn') + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + for i in range(n_layer): + enc_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd, + postprocess_cmd, + param_initializer=param_initializer, + name=name + '_layer_' + str(i)) + enc_input = enc_output + enc_output = pre_process_layer( + enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") + + return enc_output diff --git a/ERNIE/optimization.py b/ERNIE/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..e010bca9cdd57dd90e34961f2b1102888627d27a --- /dev/null +++ b/ERNIE/optimization.py @@ -0,0 +1,139 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Optimization and learning rate scheduling.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle.fluid as fluid +from utils.fp16 import create_master_params_grads, master_param_to_train_param + + +def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps): + """ Applies linear warmup of learning rate from 0 and decay to 0.""" + with fluid.default_main_program()._lr_schedule_guard(): + lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="scheduled_learning_rate") + + global_step = fluid.layers.learning_rate_scheduler._decay_step_counter() + + with fluid.layers.control_flow.Switch() as switch: + with switch.case(global_step < warmup_steps): + warmup_lr = learning_rate * (global_step / warmup_steps) + fluid.layers.tensor.assign(warmup_lr, lr) + with switch.default(): + decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay( + learning_rate=learning_rate, + decay_steps=num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + fluid.layers.tensor.assign(decayed_lr, lr) + + return lr + + +def optimization(loss, + warmup_steps, + num_train_steps, + learning_rate, + train_program, + startup_prog, + weight_decay, + scheduler='linear_warmup_decay', + use_fp16=False, + loss_scaling=1.0): + if warmup_steps > 0: + if scheduler == 'noam_decay': + scheduled_lr = fluid.layers.learning_rate_scheduler\ + .noam_decay(1/(warmup_steps *(learning_rate ** 2)), + warmup_steps) + elif scheduler == 'linear_warmup_decay': + scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, + num_train_steps) + else: + raise ValueError("Unkown learning rate scheduler, should be " + "'noam_decay' or 'linear_warmup_decay'") + optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) + else: + optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) + scheduled_lr = learning_rate + + clip_norm_thres = 1.0 + # When using mixed precision training, scale the gradient clip threshold + # by loss_scaling + if use_fp16 and loss_scaling > 1.0: + clip_norm_thres *= loss_scaling + fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres)) + + def exclude_from_weight_decay(name): + if name.find("layer_norm") > -1: + return True + bias_suffix = ["_bias", "_b", ".b_0"] + for suffix in bias_suffix: + if name.endswith(suffix): + return True + return False + + param_list = dict() + + if use_fp16: + param_grads = optimizer.backward(loss) + master_param_grads = create_master_params_grads( + param_grads, train_program, startup_prog, loss_scaling) + + for param, _ in master_param_grads: + param_list[param.name] = param * 1.0 + param_list[param.name].stop_gradient = True + + optimizer.apply_gradients(master_param_grads) + + if weight_decay > 0: + for param, grad in master_param_grads: + if exclude_from_weight_decay(param.name.rstrip(".master")): + continue + with param.block.program._optimized_guard( + [param, grad]), fluid.framework.name_scope("weight_decay"): + updated_param = param - param_list[ + param.name] * weight_decay * scheduled_lr + fluid.layers.assign(output=param, input=updated_param) + + master_param_to_train_param(master_param_grads, param_grads, + train_program) + + else: + for param in train_program.global_block().all_parameters(): + param_list[param.name] = param * 1.0 + param_list[param.name].stop_gradient = True + + _, param_grads = optimizer.minimize(loss) + + if 
+    def exclude_from_weight_decay(name):
+        if name.find("layer_norm") > -1:
+            return True
+        bias_suffix = ["_bias", "_b", ".b_0"]
+        for suffix in bias_suffix:
+            if name.endswith(suffix):
+                return True
+        return False
+
+    param_list = dict()
+
+    if use_fp16:
+        param_grads = optimizer.backward(loss)
+        master_param_grads = create_master_params_grads(
+            param_grads, train_program, startup_prog, loss_scaling)
+
+        for param, _ in master_param_grads:
+            param_list[param.name] = param * 1.0
+            param_list[param.name].stop_gradient = True
+
+        optimizer.apply_gradients(master_param_grads)
+
+        if weight_decay > 0:
+            for param, grad in master_param_grads:
+                # str.rstrip strips a set of characters, not a suffix, so the
+                # ".master" suffix is removed explicitly before the name check
+                train_param_name = param.name[:-len(".master")] \
+                    if param.name.endswith(".master") else param.name
+                if exclude_from_weight_decay(train_param_name):
+                    continue
+                with param.block.program._optimized_guard(
+                    [param, grad]), fluid.framework.name_scope("weight_decay"):
+                    updated_param = param - param_list[
+                        param.name] * weight_decay * scheduled_lr
+                    fluid.layers.assign(output=param, input=updated_param)
+
+        master_param_to_train_param(master_param_grads, param_grads,
+                                    train_program)
+
+    else:
+        for param in train_program.global_block().all_parameters():
+            param_list[param.name] = param * 1.0
+            param_list[param.name].stop_gradient = True
+
+        _, param_grads = optimizer.minimize(loss)
+
+        if weight_decay > 0:
+            for param, grad in param_grads:
+                if exclude_from_weight_decay(param.name):
+                    continue
+                with param.block.program._optimized_guard(
+                    [param, grad]), fluid.framework.name_scope("weight_decay"):
+                    updated_param = param - param_list[
+                        param.name] * weight_decay * scheduled_lr
+                    fluid.layers.assign(output=param, input=updated_param)
+
+    return scheduled_lr
diff --git a/ERNIE/pretrain_args.py b/ERNIE/pretrain_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..2543a346ff7cab114a9418978c2e5dfc5b018a5a
--- /dev/null
+++ b/ERNIE/pretrain_args.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+
+from utils.args import ArgumentGroup, print_arguments
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
+model_g.add_arg("ernie_config_path", str, "./config/ernie_config.json", "Path to the json file for ernie model config.")
+model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
+model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
+model_g.add_arg("weight_sharing", bool, True, "If set, share weights between word embedding and masked lm.")
+model_g.add_arg("generate_neg_sample", bool, False, "If set, randomly generate negative samples from positive samples.")
+
+train_g = ArgumentGroup(parser, "training", "training options.")
+train_g.add_arg("epoch", int, 100, "Number of epochs for training.")
+train_g.add_arg("learning_rate", float, 0.0001, "Learning rate used to train with warmup.")
+train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
+                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
+train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
+train_g.add_arg("num_train_steps", int, 1000000, "Total steps to perform pretraining.")
+train_g.add_arg("warmup_steps", int, 4000, "Total steps to perform warmup when pretraining.")
+train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
+train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
+train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
+train_g.add_arg("loss_scaling", float, 1.0,
+                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
+
+log_g = ArgumentGroup(parser, "logging", "logging related.")
+log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
+log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
+
+data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
+data_g.add_arg("train_filelist", str, "", "Path to training filelist.")
+data_g.add_arg("valid_filelist", str, "", "Path to valid filelist.")
+data_g.add_arg("test_filelist", str, "", "Path to test filelist.")
+data_g.add_arg("vocab_path", str, "./config/vocab.txt", "Vocabulary path.")
+data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest sequence.")
+data_g.add_arg("batch_size", int, 16, "Total number of examples in one training batch. See also --in_tokens.")
+data_g.add_arg("in_tokens", bool, False,
+               "If set, the batch size will be the maximum number of tokens in one batch. "
+               "Otherwise, it will be the maximum number of examples in one batch.")
+
+run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+run_type_g.add_arg("is_distributed", bool, False, "If set, then start distributed training.")
+run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
+run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
+run_type_g.add_arg("do_test", bool, False, "Whether to perform evaluation on test data set.")
+# yapf: enable
diff --git a/ERNIE/reader/__init__.py b/ERNIE/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ERNIE/reader/pretraining.py b/ERNIE/reader/pretraining.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8c3d883ec66571b910e34bc3963db6e6778e2d2
--- /dev/null
+++ b/ERNIE/reader/pretraining.py
@@ -0,0 +1,297 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import os
+import numpy as np
+import types
+import gzip
+import logging
+import re
+import six
+import collections
+import tokenization
+
+import paddle
+import paddle.fluid as fluid
+
+from batching import prepare_batch_data
+
+
+class ErnieDataReader(object):
+    def __init__(self,
+                 filelist,
+                 vocab_path,
+                 batch_size=4096,
+                 max_seq_len=512,
+                 shuffle_files=True,
+                 epoch=100,
+                 voc_size=0,
+                 is_test=False,
+                 generate_neg_sample=False):
+
+        self.vocab = self.load_vocab(vocab_path)
+        self.filelist = filelist
+        self.batch_size = batch_size
+        self.shuffle_files = shuffle_files
+        self.epoch = epoch
+        self.current_epoch = 0
+        self.current_file_index = 0
+        self.total_file = 0
+        self.current_file = None
+        self.voc_size = voc_size
+        self.max_seq_len = max_seq_len
+        self.pad_id = self.vocab["[PAD]"]
+        self.cls_id = self.vocab["[CLS]"]
+        self.sep_id = self.vocab["[SEP]"]
+        self.mask_id = self.vocab["[MASK]"]
+        self.is_test = is_test
+        self.generate_neg_sample = generate_neg_sample
+        assert self.batch_size > 100, "Current batch size counts total " \
+            "tokens and should not be set too small."
+ + if self.is_test: + self.epoch = 1 + self.shuffle_files = False + + def get_progress(self): + """return current progress of traning data + """ + return self.current_epoch, self.current_file_index, self.total_file, self.current_file, self.mask_type + + def parse_line(self, line, max_seq_len=512): + """ parse one line to token_ids, sentence_ids, pos_ids, label + """ + line = line.strip().split(";") + assert len(line) == 5, "One sample must have 5 fields!" + (token_ids, sent_ids, pos_ids, seg_labels, label) = line + token_ids = [int(token) for token in token_ids.split(" ")] + sent_ids = [int(token) for token in sent_ids.split(" ")] + pos_ids = [int(token) for token in pos_ids.split(" ")] + seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")] + assert len(token_ids) == len(sent_ids) == len( + pos_ids) == len(seg_labels + ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)" + label = int(label) + if len(token_ids) > max_seq_len: + return None + return [token_ids, sent_ids, pos_ids, label, seg_labels] + + def read_file(self, file): + assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file + with gzip.open(file, "rb") as f: + for line in f: + parsed_line = self.parse_line( + line, max_seq_len=self.max_seq_len) + if parsed_line is None: + continue + yield parsed_line + + def convert_to_unicode(self, text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + def load_vocab(self, vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + fin = open(vocab_file) + for num, line in enumerate(fin): + items = self.convert_to_unicode(line.strip()).split("\t") + if len(items) > 2: + break + token = items[0] + index = items[1] if len(items) == 2 else num + token = token.strip() + vocab[token] = int(index) + return vocab + + def random_pair_neg_samples(self, pos_samples): + """ randomly generate negtive samples using pos_samples + + Args: + pos_samples: list of positive samples + + Returns: + neg_samples: list of negtive samples + """ + np.random.shuffle(pos_samples) + num_sample = len(pos_samples) + neg_samples = [] + miss_num = 0 + + def split_sent(sample, max_len, sep_id): + token_seq, type_seq, pos_seq, label, seg_labels = sample + sep_index = token_seq.index(sep_id) + left_len = sep_index - 1 + if left_len <= max_len: + return (token_seq[1:sep_index], seg_labels[1:sep_index]) + else: + return [token_seq[sep_index + 1: -1], seg_labels[sep_index + 1 : -1]] + + for i in range(num_sample): + pair_index = (i + 1) % num_sample + left_tokens, left_seg_labels = split_sent(pos_samples[i], + (self.max_seq_len - 3) // 2, self.sep_id) + right_tokens, right_seg_labels = split_sent(pos_samples[pair_index], + self.max_seq_len - 3 - len(left_tokens), self.sep_id) + + token_seq = [self.cls_id] + left_tokens + [self.sep_id] + \ + right_tokens + [self.sep_id] + if len(token_seq) > self.max_seq_len: + miss_num += 1 + continue + type_seq = [0] * (len(left_tokens) + 2) + [1] * (len(right_tokens) + 1) + pos_seq = 
+            seg_label_seq = [-1] + left_seg_labels + [-1] + right_seg_labels + [-1]
+
+            assert len(token_seq) == len(type_seq) == len(pos_seq) == len(seg_label_seq), \
+                    "[ERROR] len(token_seq) == len(type_seq) == len(pos_seq) == len(seg_label_seq) must be True"
+            neg_samples.append([token_seq, type_seq, pos_seq, 0, seg_label_seq])
+
+        return neg_samples, miss_num
+
+    def mixin_negative_samples(self, pos_sample_generator, buffer=1000):
+        """1. Generate negative samples by randomly pairing sentence_1 and
+           sentence_2 of positive samples.
+        2. Combine negative samples and positive samples.
+
+        Args:
+            pos_sample_generator: a generator producing a parsed positive sample,
+                which is a list: [token_ids, sent_ids, pos_ids, 1]
+
+        Returns:
+            sample: one sample from shuffled positive samples and negative samples
+        """
+        pos_samples = []
+        num_total_miss = 0
+        pos_sample_num = 0
+        try:
+            while True:
+                while len(pos_samples) < buffer:
+                    pos_sample = next(pos_sample_generator)
+                    label = pos_sample[3]
+                    assert label == 1, "positive sample's label must be 1"
+                    pos_samples.append(pos_sample)
+                    pos_sample_num += 1
+
+                neg_samples, miss_num = self.random_pair_neg_samples(
+                    pos_samples)
+                num_total_miss += miss_num
+                samples = pos_samples + neg_samples
+                pos_samples = []
+                np.random.shuffle(samples)
+                for sample in samples:
+                    yield sample
+        except StopIteration:
+            print("stopiteration: reached end of file")
+            if len(pos_samples) == 1:
+                yield pos_samples[0]
+            elif len(pos_samples) == 0:
+                yield None
+            else:
+                neg_samples, miss_num = self.random_pair_neg_samples(
+                    pos_samples)
+                num_total_miss += miss_num
+                samples = pos_samples + neg_samples
+                pos_samples = []
+                np.random.shuffle(samples)
+                for sample in samples:
+                    yield sample
+        print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
+              (num_total_miss, pos_sample_num * 2,
+               num_total_miss / (pos_sample_num * 2)))
+
+    def data_generator(self):
+        """
+        data_generator
+        """
+        files = open(self.filelist).readlines()
+        self.total_file = len(files)
+        assert self.total_file > 0, "[Error] data_dir is empty"
+
+        def wrapper():
+            def reader():
+                for epoch in range(self.epoch):
+                    self.current_epoch = epoch + 1
+                    if self.shuffle_files:
+                        np.random.shuffle(files)
+                    for index, file in enumerate(files):
+                        file, mask_word_prob = file.strip().split("\t")
+                        mask_word = (np.random.random() < float(mask_word_prob))
+                        self.current_file_index = index + 1
+                        self.current_file = file
+                        if mask_word:
+                            self.mask_type = "mask_word"
+                        else:
+                            self.mask_type = "mask_char"
+
+                        sample_generator = self.read_file(file)
+                        if not self.is_test and self.generate_neg_sample:
+                            sample_generator = self.mixin_negative_samples(
+                                sample_generator)
+                        for sample in sample_generator:
+                            if sample is None:
+                                continue
+                            sample.append(mask_word)
+                            yield sample
+
+            def batch_reader(reader, batch_size):
+                batch, total_token_num, max_len = [], 0, 0
+                for parsed_line in reader():
+                    token_ids, sent_ids, pos_ids, label, seg_labels, mask_word = parsed_line
+                    max_len = max(max_len, len(token_ids))
+                    if (len(batch) + 1) * max_len <= batch_size:
+                        batch.append(parsed_line)
+                        total_token_num += len(token_ids)
+                    else:
+                        yield batch, total_token_num
+                        batch, total_token_num, max_len = [parsed_line], len(
+                            token_ids), len(token_ids)
+
+                if len(batch) > 0:
+                    yield batch, total_token_num
+
+            for batch_data, total_token_num in batch_reader(reader,
+                                                            self.batch_size):
+                yield prepare_batch_data(
+                    batch_data,
+                    total_token_num,
+                    voc_size=self.voc_size,
+                    pad_id=self.pad_id,
+                    cls_id=self.cls_id,
+                    sep_id=self.sep_id,
mask_id=self.mask_id, + return_attn_bias=True, + return_max_len=False, + return_num_token=False) + + return wrapper + + +if __name__ == "__main__": + pass diff --git a/ERNIE/reader/task_reader.py b/ERNIE/reader/task_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..28aacbdbfc9c544a170f6a969605eb464112d335 --- /dev/null +++ b/ERNIE/reader/task_reader.py @@ -0,0 +1,313 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import csv +import json +import numpy as np +from collections import namedtuple + +import tokenization +from batching import pad_batch_data + + +class BaseReader(object): + def __init__(self, + vocab_path, + label_map_config=None, + max_seq_len=512, + do_lower_case=True, + in_tokens=False, + random_seed=None): + self.max_seq_len = max_seq_len + self.tokenizer = tokenization.FullTokenizer( + vocab_file=vocab_path, do_lower_case=do_lower_case) + self.vocab = self.tokenizer.vocab + self.pad_id = self.vocab["[PAD]"] + self.cls_id = self.vocab["[CLS]"] + self.sep_id = self.vocab["[SEP]"] + self.in_tokens = in_tokens + + np.random.seed(random_seed) + + self.current_example = 0 + self.current_epoch = 0 + self.num_examples = 0 + + if label_map_config: + with open(label_map_config) as f: + self.label_map = json.load(f) + else: + self.label_map = None + pass + + def get_train_progress(self): + """Gets progress for training phase.""" + return self.current_example, self.current_epoch + + def _read_tsv(self, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + headers = next(reader) + Example = namedtuple('Example', headers) + + examples = [] + for line in reader: + example = Example(*line) + examples.append(example) + return examples + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + def _convert_example_to_record(self, example, max_seq_length, tokenizer): + """Converts a single `Example` into a single `Record`.""" + + text_a = tokenization.convert_to_unicode(example.text_a) + tokens_a = tokenizer.tokenize(text_a) + tokens_b = None + if "text_b" in example._fields: + text_b = tokenization.convert_to_unicode(example.text_b) + tokens_b = tokenizer.tokenize(text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
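+            # A worked example with assumed numbers: for max_seq_length=8,
+            # "- 3" leaves 5 wordpiece slots, and _truncate_seq_pair pops
+            # tokens off the longer of tokens_a/tokens_b until
+            # len(tokens_a) + len(tokens_b) <= 5.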
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > max_seq_length - 2:
+                tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+        # The convention in BERT/ERNIE is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids: 0     0   0   0  0     0 0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens = []
+        text_type_ids = []
+        tokens.append("[CLS]")
+        text_type_ids.append(0)
+        for token in tokens_a:
+            tokens.append(token)
+            text_type_ids.append(0)
+        tokens.append("[SEP]")
+        text_type_ids.append(0)
+
+        if tokens_b:
+            for token in tokens_b:
+                tokens.append(token)
+                text_type_ids.append(1)
+            tokens.append("[SEP]")
+            text_type_ids.append(1)
+
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+
+        if self.label_map:
+            label_id = self.label_map[example.label]
+        else:
+            label_id = example.label
+
+        Record = namedtuple(
+            'Record',
+            ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
+
+        qid = None
+        if "qid" in example._fields:
+            qid = example.qid
+
+        record = Record(
+            token_ids=token_ids,
+            text_type_ids=text_type_ids,
+            position_ids=position_ids,
+            label_id=label_id,
+            qid=qid)
+        return record
+
+    def _prepare_batch_data(self, examples, batch_size, phase=None):
+        """Generate batch records."""
+        batch_records, max_len = [], 0
+        for index, example in enumerate(examples):
+            if phase == "train":
+                self.current_example = index
+            record = self._convert_example_to_record(example, self.max_seq_len,
+                                                     self.tokenizer)
+            max_len = max(max_len, len(record.token_ids))
+            if self.in_tokens:
+                to_append = (len(batch_records) + 1) * max_len <= batch_size
+            else:
+                to_append = len(batch_records) < batch_size
+            if to_append:
+                batch_records.append(record)
+            else:
+                yield self._pad_batch_records(batch_records)
+                batch_records, max_len = [record], len(record.token_ids)
+
+        if len(batch_records) > 0:
+            yield self._pad_batch_records(batch_records)
+
+    def get_num_examples(self, input_file):
+        examples = self._read_tsv(input_file)
+        return len(examples)
+
+    def data_generator(self, input_file, batch_size, epoch, shuffle=True, phase=None):
+        examples = self._read_tsv(input_file)
+
+        def wrapper():
+            for epoch_index in range(epoch):
+                if phase == "train":
+                    self.current_example = 0
+                    self.current_epoch = epoch_index
+                if shuffle:
+                    np.random.shuffle(examples)
+
+                for batch_data in self._prepare_batch_data(
+                        examples, batch_size, phase=phase):
+                    yield batch_data
+        return wrapper
+
+
+class ClassifyReader(BaseReader):
+    def _read_tsv(self, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            headers = next(reader)
+            text_indices = [
+                index for index, h in enumerate(headers) if h != "label"
+            ]
+            Example = namedtuple('Example', headers)
+
+            examples = []
+            for line in reader:
+                for index, text in enumerate(line):
+                    if index in text_indices:
+                        line[index] = text.replace(' ', '')
+                example = Example(*line)
+                examples.append(example)
+            return examples
+
+    def _pad_batch_records(self, batch_records):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+        batch_labels = [record.label_id for record in batch_records]
+        batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
+
+        # use `is not None` so that a legitimate qid of 0 is kept
+        if batch_records[0].qid is not None:
+            batch_qids = [record.qid for record in batch_records]
+            batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
+        else:
+            batch_qids = np.array([]).astype("int64").reshape([-1, 1])
+
+        # padding
+        padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
+            batch_token_ids,
+            pad_idx=self.pad_id,
+            return_next_sent_pos=True,
+            return_attn_bias=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, pad_idx=self.pad_id)
+
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            self_attn_bias, batch_labels, next_sent_index, batch_qids
+        ]
+
+        return return_list
+
+
+class SequenceLabelReader(BaseReader):
+    def _pad_batch_records(self, batch_records):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+        batch_label_ids = [record.label_ids for record in batch_records]
+        batch_seq_lens = [len(record.token_ids) for record in batch_records]
+
+        # padding
+        padded_token_ids, self_attn_bias = pad_batch_data(
+            batch_token_ids,
+            pad_idx=self.pad_id,
+            return_next_sent_pos=False,
+            return_attn_bias=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, pad_idx=self.pad_id)
+        padded_label_ids = pad_batch_data(
+            batch_label_ids, pad_idx=len(self.label_map) - 1)
+        batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape([-1, 1])
+
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            self_attn_bias, padded_label_ids, batch_seq_lens
+        ]
+        return return_list
+
+    def _reseg_token_label(self, tokens, labels, tokenizer):
+        assert len(tokens) == len(labels)
+        ret_tokens = []
+        ret_labels = []
+        for token, label in zip(tokens, labels):
+            sub_token = tokenizer.tokenize(token)
+            if len(sub_token) == 0:
+                continue
+            ret_tokens.extend(sub_token)
+            ret_labels.append(label)
+            if len(sub_token) < 2:
+                continue
+            sub_label = label
+            if label.startswith("B-"):
+                sub_label = "I-" + label[2:]
+            ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+        assert len(ret_tokens) == len(ret_labels)
+        return ret_tokens, ret_labels
+
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        # tokens and labels are delimited by the control character \2,
+        # which renders invisibly in some views
+        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
+        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
+        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
+
+        if len(tokens) > max_seq_length - 2:
+            tokens = tokens[0:(max_seq_length - 2)]
+            labels = labels[0:(max_seq_length - 2)]
+
+        tokens = ["[CLS]"] + 
tokens + ["[SEP]"] + token_ids = tokenizer.convert_tokens_to_ids(tokens) + position_ids = list(range(len(token_ids))) + text_type_ids = [0] * len(token_ids) + no_entity_id = len(self.label_map) - 1 + label_ids = [no_entity_id] + [self.label_map[label] for label in labels] + [no_entity_id] + + Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) + record = Record( + token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids, + label_ids=label_ids) + return record + +if __name__ == '__main__': + pass diff --git a/ERNIE/run_classifier.py b/ERNIE/run_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..a891a37f9a3dadb089d8d07644c98a90ef154378 --- /dev/null +++ b/ERNIE/run_classifier.py @@ -0,0 +1,276 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Finetuning on classification tasks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import argparse +import numpy as np +import multiprocessing + +import paddle +import paddle.fluid as fluid + +import reader.task_reader as task_reader +from model.ernie import ErnieConfig +from finetune.classifier import create_model, evaluate +from optimization import optimization +from utils.args import ArgumentGroup, print_arguments +from utils.init import init_pretraining_params, init_checkpoint +from finetune_args import parser + + +args = parser.parse_args() + +def main(args): + ernie_config = ErnieConfig(args.ernie_config_path) + ernie_config.print_config() + + if args.use_cuda: + place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) + dev_count = fluid.core.get_cuda_device_count() + else: + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exe = fluid.Executor(place) + + reader = task_reader.ClassifyReader(vocab_path=args.vocab_path, + label_map_config=args.label_map_config, + max_seq_len=args.max_seq_len, + do_lower_case=args.do_lower_case, + in_tokens=args.in_tokens, + random_seed=args.random_seed) + + if not (args.do_train or args.do_val or args.do_test): + raise ValueError("For args `do_train`, `do_val` and `do_test`, at " + "least one of them must be True.") + + startup_prog = fluid.Program() + if args.random_seed is not None: + startup_prog.random_seed = args.random_seed + + if args.do_train: + train_data_generator = reader.data_generator( + input_file=args.train_set, + batch_size=args.batch_size, + epoch=args.epoch, + shuffle=True, + phase="train") + + num_train_examples = reader.get_num_examples(args.train_set) + + if args.in_tokens: + max_train_steps = args.epoch * num_train_examples // ( + args.batch_size // args.max_seq_len) // dev_count + else: + max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count + + warmup_steps = int(max_train_steps * args.warmup_proportion) + print("Device count: %d" % dev_count) + 
print("Num train examples: %d" % num_train_examples) + print("Max train steps: %d" % max_train_steps) + print("Num warmup steps: %d" % warmup_steps) + + train_program = fluid.Program() + + with fluid.program_guard(train_program, startup_prog): + with fluid.unique_name.guard(): + train_pyreader, graph_vars = create_model( + args, + pyreader_name='train_reader', + ernie_config=ernie_config) + scheduled_lr = optimization( + loss=graph_vars["loss"], + warmup_steps=warmup_steps, + num_train_steps=max_train_steps, + learning_rate=args.learning_rate, + train_program=train_program, + startup_prog=startup_prog, + weight_decay=args.weight_decay, + scheduler=args.lr_scheduler, + use_fp16=args.use_fp16, + loss_scaling=args.loss_scaling) + + fluid.memory_optimize( + input_program=train_program, + skip_opt_set=[graph_vars["loss"].name, + graph_vars["probs"].name, + graph_vars["accuracy"].name, + graph_vars["num_seqs"].name, + ]) + + if args.verbose: + if args.in_tokens: + lower_mem, upper_mem, unit = fluid.contrib.memory_usage( + program=train_program, + batch_size=args.batch_size // args.max_seq_len) + else: + lower_mem, upper_mem, unit = fluid.contrib.memory_usage( + program=train_program, batch_size=args.batch_size) + print("Theoretical memory usage in training: %.3f - %.3f %s" % + (lower_mem, upper_mem, unit)) + + if args.do_val or args.do_test: + test_prog = fluid.Program() + with fluid.program_guard(test_prog, startup_prog): + with fluid.unique_name.guard(): + test_pyreader, graph_vars = create_model( + args, + pyreader_name='test_reader', + ernie_config=ernie_config) + + test_prog = test_prog.clone(for_test=True) + + exe.run(startup_prog) + + if args.do_train: + if args.init_checkpoint and args.init_pretraining_params: + print( + "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " + "both are set! 
Only arg 'init_checkpoint' will take effect.")
+        if args.init_checkpoint:
+            init_checkpoint(
+                exe,
+                args.init_checkpoint,
+                main_program=startup_prog,
+                use_fp16=args.use_fp16)
+        elif args.init_pretraining_params:
+            init_pretraining_params(
+                exe,
+                args.init_pretraining_params,
+                main_program=startup_prog,
+                use_fp16=args.use_fp16)
+    elif args.do_val or args.do_test:
+        if not args.init_checkpoint:
+            raise ValueError("args 'init_checkpoint' should be set if "
+                             "only doing validation or testing!")
+        init_checkpoint(
+            exe,
+            args.init_checkpoint,
+            main_program=startup_prog,
+            use_fp16=args.use_fp16)
+
+    if args.do_train:
+        exec_strategy = fluid.ExecutionStrategy()
+        if args.use_fast_executor:
+            exec_strategy.use_experimental_executor = True
+        exec_strategy.num_threads = dev_count
+        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
+
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=args.use_cuda,
+            loss_name=graph_vars["loss"].name,
+            exec_strategy=exec_strategy,
+            main_program=train_program)
+
+        train_pyreader.decorate_tensor_provider(train_data_generator)
+    else:
+        train_exe = None
+
+    if args.do_val or args.do_test:
+        test_exe = fluid.ParallelExecutor(
+            use_cuda=args.use_cuda,
+            main_program=test_prog,
+            share_vars_from=train_exe)
+
+    if args.do_train:
+        train_pyreader.start()
+        steps = 0
+        if warmup_steps > 0:
+            graph_vars["learning_rate"] = scheduled_lr
+
+        time_begin = time.time()
+        while True:
+            try:
+                steps += 1
+                if steps % args.skip_steps != 0:
+                    train_exe.run(fetch_list=[])
+                else:
+                    outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, "train")
+
+                    if args.verbose:
+                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
+                        verbose += "learning rate: %f" % (
+                            outputs["learning_rate"]
+                            if warmup_steps > 0 else args.learning_rate)
+                        print(verbose)
+
+                    current_example, current_epoch = reader.get_train_progress()
+                    time_end = time.time()
+                    used_time = time_end - time_begin
+                    print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
+                          "ave acc: %f, speed: %f steps/s" %
+                          (current_epoch, current_example, num_train_examples,
+                           steps, outputs["loss"], outputs["accuracy"],
+                           args.skip_steps / used_time))
+                    time_begin = time.time()
+
+                if steps % args.save_steps == 0:
+                    save_path = os.path.join(args.checkpoints,
+                                             "step_" + str(steps))
+                    fluid.io.save_persistables(exe, save_path, train_program)
+
+                if steps % args.validation_steps == 0:
+                    # evaluate dev set
+                    if args.do_val:
+                        test_pyreader.decorate_tensor_provider(
+                            reader.data_generator(
+                                args.dev_set,
+                                batch_size=args.batch_size,
+                                epoch=1,
+                                shuffle=False))
+                        evaluate(exe, test_prog, test_pyreader, graph_vars, "dev")
+                    # evaluate test set
+                    if args.do_test:
+                        test_pyreader.decorate_tensor_provider(
+                            reader.data_generator(
+                                args.test_set,
+                                batch_size=args.batch_size,
+                                epoch=1,
+                                shuffle=False))
+                        evaluate(exe, test_prog, test_pyreader, graph_vars, "test")
+            except fluid.core.EOFException:
+                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
+                fluid.io.save_persistables(exe, save_path, train_program)
+                train_pyreader.reset()
+                break
+
+    # final eval on dev set
+    if args.do_val:
+        test_pyreader.decorate_tensor_provider(
+            reader.data_generator(
+                args.dev_set, batch_size=args.batch_size, epoch=1,
+                shuffle=False))
+        print("Final validation result:")
+        evaluate(exe, test_prog, test_pyreader, graph_vars, "dev")
+
+    # final eval on test set
+    if args.do_test:
+        test_pyreader.decorate_tensor_provider(
+            reader.data_generator(
+                args.test_set,
+                batch_size=args.batch_size,
+                epoch=1,
+                shuffle=False))
+        print("Final test result:")
+        evaluate(exe, test_prog, test_pyreader, graph_vars, "test")
+
+
+if __name__ == '__main__':
+    print_arguments(args)
+    main(args)
diff --git a/ERNIE/run_sequence_labeling.py b/ERNIE/run_sequence_labeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..71158cf219e1583ea6427448a947740cb86ecc06
--- /dev/null
+++ b/ERNIE/run_sequence_labeling.py
@@ -0,0 +1,277 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning on sequence labeling tasks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import numpy as np
+import multiprocessing
+
+import paddle
+import paddle.fluid as fluid
+
+import reader.task_reader as task_reader
+from model.ernie import ErnieConfig
+from optimization import optimization
+from utils.init import init_pretraining_params, init_checkpoint
+from utils.args import print_arguments
+from finetune.sequence_label import create_model, evaluate
+from finetune_args import parser
+
+
+args = parser.parse_args()
+
+
+def main(args):
+    ernie_config = ErnieConfig(args.ernie_config_path)
+    ernie_config.print_config()
+
+    if args.use_cuda:
+        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
+        dev_count = fluid.core.get_cuda_device_count()
+    else:
+        place = fluid.CPUPlace()
+        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+    exe = fluid.Executor(place)
+
+    reader = task_reader.SequenceLabelReader(
+        vocab_path=args.vocab_path,
+        label_map_config=args.label_map_config,
+        max_seq_len=args.max_seq_len,
+        do_lower_case=args.do_lower_case,
+        in_tokens=args.in_tokens,
+        random_seed=args.random_seed)
+
+    if not (args.do_train or args.do_val or args.do_test):
+        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
+                         "least one of them must be True.")
+
+    startup_prog = fluid.Program()
+    if args.random_seed is not None:
+        startup_prog.random_seed = args.random_seed
+
+    if args.do_train:
+        train_data_generator = reader.data_generator(
+            input_file=args.train_set,
+            batch_size=args.batch_size,
+            epoch=args.epoch,
+            shuffle=True,
+            phase="train")
+
+        num_train_examples = reader.get_num_examples(args.train_set)
+
+        if args.in_tokens:
+            max_train_steps = args.epoch * num_train_examples // (
+                args.batch_size // args.max_seq_len) // dev_count
+        else:
+            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
+
+        warmup_steps = int(max_train_steps * args.warmup_proportion)
+        print("Device count: %d" % dev_count)
+        print("Num train examples: %d" % num_train_examples)
+        print("Max train steps: %d" % max_train_steps)
+        print("Num warmup steps: %d" % warmup_steps)
+
+        train_program = fluid.Program()
+
+        with fluid.program_guard(train_program, startup_prog):
+            with fluid.unique_name.guard():
+                train_pyreader, graph_vars = create_model(
+                    args,
+                    pyreader_name='train_reader',
+                    ernie_config=ernie_config)
+                scheduled_lr = optimization(
+                    loss=graph_vars["loss"],
+                    warmup_steps=warmup_steps,
+                    num_train_steps=max_train_steps,
+                    learning_rate=args.learning_rate,
+                    train_program=train_program,
+                    startup_prog=startup_prog,
+                    weight_decay=args.weight_decay,
+                    scheduler=args.lr_scheduler,
+                    use_fp16=args.use_fp16,
+                    loss_scaling=args.loss_scaling)
+
+                fluid.memory_optimize(
+                    input_program=train_program,
+                    skip_opt_set=[
+                        graph_vars["loss"].name,
+                        graph_vars["labels"].name,
+                        graph_vars["infers"].name,
+                        graph_vars["seq_lens"].name
+                    ])
+
+        if args.verbose:
+            if args.in_tokens:
+                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
+                    program=train_program,
+                    batch_size=args.batch_size // args.max_seq_len)
+            else:
+                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
+                    program=train_program, batch_size=args.batch_size)
+            print("Theoretical memory usage in training: %.3f - %.3f %s" %
+                  (lower_mem, upper_mem, unit))
+
+    if args.do_val or args.do_test:
+        test_prog = fluid.Program()
+        with fluid.program_guard(test_prog, startup_prog):
+            with fluid.unique_name.guard():
+                test_pyreader, graph_vars = create_model(
+                    args,
+                    pyreader_name='test_reader',
+                    ernie_config=ernie_config)
+
+        test_prog = test_prog.clone(for_test=True)
+
+    exe.run(startup_prog)
+
+    if args.do_train:
+        if args.init_checkpoint and args.init_pretraining_params:
+            print(
+                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
+                "both are set! Only arg 'init_checkpoint' will take effect.")
+        if args.init_checkpoint:
+            init_checkpoint(
+                exe,
+                args.init_checkpoint,
+                main_program=startup_prog,
+                use_fp16=args.use_fp16)
+        elif args.init_pretraining_params:
+            init_pretraining_params(
+                exe,
+                args.init_pretraining_params,
+                main_program=startup_prog,
+                use_fp16=args.use_fp16)
+    elif args.do_val or args.do_test:
+        if not args.init_checkpoint:
+            raise ValueError("args 'init_checkpoint' should be set if "
+                             "only doing validation or testing!")
+        init_checkpoint(
+            exe,
+            args.init_checkpoint,
+            main_program=startup_prog,
+            use_fp16=args.use_fp16)
+
+    if args.do_train:
+        exec_strategy = fluid.ExecutionStrategy()
+        if args.use_fast_executor:
+            exec_strategy.use_experimental_executor = True
+        exec_strategy.num_threads = dev_count
+        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
+
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=args.use_cuda,
+            loss_name=graph_vars["loss"].name,
+            exec_strategy=exec_strategy,
+            main_program=train_program)
+
+        train_pyreader.decorate_tensor_provider(train_data_generator)
+    else:
+        train_exe = None
+
+    if args.do_val or args.do_test:
+        test_exe = fluid.ParallelExecutor(
+            use_cuda=args.use_cuda,
+            main_program=test_prog,
+            share_vars_from=train_exe)
+
+    if args.do_train:
+        train_pyreader.start()
+        steps = 0
+        if warmup_steps > 0:
+            graph_vars["learning_rate"] = scheduled_lr
+
+        time_begin = time.time()
+        while True:
+            try:
+                steps += 1
+                if steps % args.skip_steps != 0:
+                    train_exe.run(fetch_list=[])
+                else:
+                    outputs = evaluate(train_exe, train_program, train_pyreader,
+                                       graph_vars, args.num_labels, "train",
+                                       dev_count)
+                    if args.verbose:
+                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
+                        verbose += "learning rate: %f" % (
+                            outputs["lr"] if warmup_steps > 0 else args.learning_rate)
+                        print(verbose)
+
+                    current_example, current_epoch = reader.get_train_progress()
+                    time_end = time.time()
+                    used_time = time_end - time_begin
+                    print("epoch: %d, progress: %d/%d, step: %d, loss: 
%f, " + "f1: %f, precision: %f, recall: %f, speed: %f steps/s" % + (current_epoch, current_example, num_train_examples, + steps, outputs["loss"], outputs["f1"], + outputs["precision"], outputs["recall"], + args.skip_steps / used_time)) + time_begin = time.time() + + if steps % args.save_steps == 0: + save_path = os.path.join(args.checkpoints, + "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + + if steps % args.validation_steps == 0: + # evaluate dev set + if args.do_val: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.dev_set, + batch_size=args.batch_size, + epoch=1, + shuffle=False)) + evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev") + # evaluate test set + if args.do_test: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.test_set, + batch_size=args.batch_size, + epoch=1, + shuffle=False)) + evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "test") + + except fluid.core.EOFException: + save_path = os.path.join(args.checkpoints, "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + train_pyreader.reset() + break + + # final eval on dev set + if args.do_val: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.dev_set, batch_size=args.batch_size, epoch=1, + shuffle=False)) + print("Final validation result:") + evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev") + + # final eval on test set + if args.do_test: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.test_set, + batch_size=args.batch_size, + epoch=1, + shuffle=False)) + print("Final test result:") + evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "test") + + +if __name__ == '__main__': + print_arguments(args) + main(args) diff --git a/ERNIE/tokenization.py b/ERNIE/tokenization.py new file mode 100644 index 0000000000000000000000000000000000000000..f906b537177dce430888fbc1738cd4b33906d705 --- /dev/null +++ b/ERNIE/tokenization.py @@ -0,0 +1,370 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import unicodedata +import six + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + fin = open(vocab_file) + for num, line in enumerate(fin): + items = convert_to_unicode(line.strip()).split("\t") + if len(items) > 2: + break + token = items[0] + index = items[1] if len(items) == 2 else num + token = token.strip() + vocab[token] = int(index) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class CharTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for 
+            for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                split_tokens.append(sub_token)
+
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        return convert_by_vocab(self.vocab, tokens)
+
+    def convert_ids_to_tokens(self, ids):
+        return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
+
+        Args:
+            do_lower_case: Whether to lower case the input.
+        """
+        self.do_lower_case = do_lower_case
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text."""
+        text = convert_to_unicode(text)
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        text = self._tokenize_chinese_chars(text)
+
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
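+        # For example, U+4E2D falls in the base CJK block below and is
+        # therefore padded with spaces by _tokenize_chinese_chars, while
+        # Hangul and Katakana code points fall through and return False.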
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+            (cp >= 0x3400 and cp <= 0x4DBF) or  #
+            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+            (cp >= 0x2B820 and cp <= 0x2CEAF) or
+            (cp >= 0xF900 and cp <= 0xFAFF) or  #
+            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+
+        For example:
+            input = "unaffable"
+            output = ["un", "##aff", "##able"]
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        text = convert_to_unicode(text)
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
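+    # For example, under these rules "$" (cp 36, Unicode class Sc) and
+    # "`" (cp 96, class Sk) count as punctuation even though Unicode
+    # classifies them as symbols rather than punctuation.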
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/ERNIE/train.py b/ERNIE/train.py new file mode 100644 index 0000000000000000000000000000000000000000..470d34bc94a01d16629784019ed2562bc86b6e82 --- /dev/null +++ b/ERNIE/train.py @@ -0,0 +1,360 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ERNIE pretraining.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import argparse +import numpy as np +import multiprocessing + +import paddle +import paddle.fluid as fluid + +from reader.pretraining import ErnieDataReader +from model.ernie import ErnieModel, ErnieConfig +from optimization import optimization +from utils.args import ArgumentGroup, print_arguments +from utils.init import init_checkpoint, init_pretraining_params + +from pretrain_args import parser + +args = parser.parse_args() +# yapf: enable. + +def create_model(pyreader_name, ernie_config): + pyreader = fluid.layers.py_reader( + capacity=70, + shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1], + [-1, 1], [-1, 1]], + dtypes=[ + 'int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64', + 'int64' + ], + lod_levels=[0, 0, 0, 0, 0, 0, 0, 0], + name=pyreader_name, + use_double_buffer=True) + + (src_ids, pos_ids, sent_ids, self_attn_mask, mask_label, mask_pos, labels, + next_sent_index) = fluid.layers.read_file(pyreader) + + ernie = ErnieModel( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + self_attn_mask=self_attn_mask, + config=ernie_config, + weight_sharing=args.weight_sharing, + use_fp16=args.use_fp16) + + next_sent_acc, mask_lm_loss, total_loss = ernie.get_pretraining_output( + mask_label, mask_pos, labels, next_sent_index) + + if args.use_fp16 and args.loss_scaling > 1.0: + total_loss *= args.loss_scaling + + return pyreader, next_sent_acc, mask_lm_loss, total_loss + + +def predict_wrapper(args, + exe, + ernie_config, + test_prog=None, + pyreader=None, + fetch_list=None): + # Context to do validation. 
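+    # Flow of this wrapper: build a one-epoch, unshuffled ErnieDataReader
+    # over the validation/test filelist, then return a closure that drains
+    # the pyreader and accumulates total loss, masked-LM cost and
+    # next-sentence accuracy until EOF.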
+    filelist = args.test_filelist if args.do_test else args.valid_filelist
+    data_reader = ErnieDataReader(
+        filelist,
+        vocab_path=args.vocab_path,
+        batch_size=args.batch_size,
+        voc_size=ernie_config['vocab_size'],
+        shuffle_files=False,
+        epoch=1,
+        max_seq_len=args.max_seq_len,
+        is_test=True)
+
+    if args.do_test:
+        assert args.init_checkpoint is not None, "[FATAL] Please use --init_checkpoint '/path/to/checkpoints' \
+            to specify your pretrained model checkpoints"
+
+        init_pretraining_params(exe, args.init_checkpoint, test_prog)
+
+    def predict(exe=exe, pyreader=pyreader):
+
+        pyreader.decorate_tensor_provider(data_reader.data_generator())
+        pyreader.start()
+
+        cost = 0
+        lm_cost = 0
+        acc = 0
+        steps = 0
+        time_begin = time.time()
+        while True:
+            try:
+                each_next_acc, each_mask_lm_cost, each_total_cost = exe.run(
+                    fetch_list=fetch_list, program=test_prog)
+                acc += each_next_acc
+                lm_cost += each_mask_lm_cost
+                cost += each_total_cost
+                steps += 1
+                if args.do_test and steps % args.skip_steps == 0:
+                    print("[test_set] steps: %d" % steps)
+
+            except fluid.core.EOFException:
+                pyreader.reset()
+                break
+
+        used_time = time.time() - time_begin
+        return cost, lm_cost, acc, steps, (args.skip_steps / used_time)
+
+    return predict
+
+
+def test(args):
+    ernie_config = ErnieConfig(args.ernie_config_path)
+    ernie_config.print_config()
+
+    test_prog = fluid.Program()
+    test_startup = fluid.Program()
+    with fluid.program_guard(test_prog, test_startup):
+        with fluid.unique_name.guard():
+            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+                pyreader_name='test_reader', ernie_config=ernie_config)
+
+    test_prog = test_prog.clone(for_test=True)
+
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(test_startup)
+
+    predict = predict_wrapper(
+        args,
+        exe,
+        ernie_config,
+        test_prog=test_prog,
+        pyreader=test_pyreader,
+        fetch_list=[next_sent_acc.name, mask_lm_loss.name, total_loss.name])
+
+    print("test begin")
+    loss, lm_loss, acc, steps, speed = predict()
+    print(
+        "[test_set] loss: %f, global ppl: %f, next_sent_acc: %f, speed: %f steps/s"
+        % (np.mean(np.array(loss) / steps),
+           np.exp(np.mean(np.array(lm_loss) / steps)),
+           np.mean(np.array(acc) / steps), speed))
+
+
+def train(args):
+    print("pretraining start")
+    ernie_config = ErnieConfig(args.ernie_config_path)
+    ernie_config.print_config()
+
+    train_program = fluid.Program()
+    startup_prog = fluid.Program()
+    with fluid.program_guard(train_program, startup_prog):
+        with fluid.unique_name.guard():
+            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+                pyreader_name='train_reader', ernie_config=ernie_config)
+            scheduled_lr = optimization(
+                loss=total_loss,
+                warmup_steps=args.warmup_steps,
+                num_train_steps=args.num_train_steps,
+                learning_rate=args.learning_rate,
+                train_program=train_program,
+                startup_prog=startup_prog,
+                weight_decay=args.weight_decay,
+                scheduler=args.lr_scheduler,
+                use_fp16=args.use_fp16,
+                loss_scaling=args.loss_scaling)
+
+            fluid.memory_optimize(
+                input_program=train_program,
+                skip_opt_set=[
+                    next_sent_acc.name, mask_lm_loss.name, total_loss.name
+                ])
+
+    test_prog = fluid.Program()
+    with fluid.program_guard(test_prog, startup_prog):
+        with fluid.unique_name.guard():
+            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+                pyreader_name='test_reader', ernie_config=ernie_config)
+
+    test_prog = test_prog.clone(for_test=True)
+
+    if args.use_cuda:
+        place = fluid.CUDAPlace(0)
+        dev_count = 
fluid.core.get_cuda_device_count() + else: + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + print("Device count %d" % dev_count) + print("theoretical memory usage: ") + print(fluid.contrib.memory_usage( + program=train_program, batch_size=args.batch_size // args.max_seq_len)) + + nccl2_num_trainers = 1 + nccl2_trainer_id = 0 + print("args.is_distributed:", args.is_distributed) + if args.is_distributed: + worker_endpoints_env = os.getenv("worker_endpoints") + worker_endpoints = worker_endpoints_env.split(",") + trainers_num = len(worker_endpoints) + current_endpoint = os.getenv("current_endpoint") + trainer_id = worker_endpoints.index(current_endpoint) + if trainer_id == 0: + print("train_id == 0, sleep 60s") + time.sleep(60) + print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ + trainer_id:{}" + .format(worker_endpoints, trainers_num, + current_endpoint, trainer_id)) + + # prepare nccl2 env. + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile( + trainer_id, + trainers=worker_endpoints_env, + current_endpoint=current_endpoint, + program=train_program, + startup_program=startup_prog) + nccl2_num_trainers = trainers_num + nccl2_trainer_id = trainer_id + + exe = fluid.Executor(place) + exe.run(startup_prog) + + if args.init_checkpoint and args.init_checkpoint != "": + init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16) + + data_reader = ErnieDataReader( + filelist=args.train_filelist, + batch_size=args.batch_size, + vocab_path=args.vocab_path, + voc_size=ernie_config['vocab_size'], + epoch=args.epoch, + max_seq_len=args.max_seq_len, + generate_neg_sample=args.generate_neg_sample) + + exec_strategy = fluid.ExecutionStrategy() + if args.use_fast_executor: + exec_strategy.use_experimental_executor = True + exec_strategy.num_threads = dev_count + exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps) + + build_strategy = fluid.BuildStrategy() + build_strategy.remove_unnecessary_lock = False + + train_exe = fluid.ParallelExecutor( + use_cuda=args.use_cuda, + loss_name=total_loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy, + main_program=train_program, + num_trainers=nccl2_num_trainers, + trainer_id=nccl2_trainer_id) + + if args.valid_filelist and args.valid_filelist != "": + predict = predict_wrapper( + args, + exe, + ernie_config, + test_prog=test_prog, + pyreader=test_pyreader, + fetch_list=[ + next_sent_acc.name, mask_lm_loss.name, total_loss.name + ]) + + train_pyreader.decorate_tensor_provider(data_reader.data_generator()) + train_pyreader.start() + steps = 0 + cost = [] + lm_cost = [] + acc = [] + time_begin = time.time() + while steps < args.num_train_steps: + try: + steps += nccl2_num_trainers + skip_steps = args.skip_steps * nccl2_num_trainers + + if nccl2_trainer_id != 0: + train_exe.run(fetch_list=[]) + continue + + if steps % skip_steps != 0: + train_exe.run(fetch_list=[]) + else: + each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run( + fetch_list=[ + next_sent_acc.name, mask_lm_loss.name, total_loss.name, + scheduled_lr.name + ]) + acc.extend(each_next_acc) + lm_cost.extend(each_mask_lm_cost) + cost.extend(each_total_cost) + + print("feed_queue size", train_pyreader.queue.size()) + time_end = time.time() + used_time = time_end - time_begin + epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress( + ) + print("current 
learning_rate:%f" % np_lr[0]) + print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " + "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s" + % (epoch, current_file_index, total_file, steps, + np.mean(np.array(cost)), + np.mean(np.exp(np.array(lm_cost))), + np.mean(np.array(acc)), skip_steps / used_time, + current_file, mask_type)) + cost = [] + lm_cost = [] + acc = [] + time_begin = time.time() + + if steps % args.save_steps == 0: + save_path = os.path.join(args.checkpoints, "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + + if args.valid_filelist and steps % args.validation_steps == 0: + vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict( + ) + print("[validation_set] epoch: %d, step: %d, " + "loss: %f, global ppl: %f, batch-averged ppl: %f, " + "next_sent_acc: %f, speed: %f steps/s" % + (epoch, steps, + np.mean(np.array(vali_cost) / vali_steps), + np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)), + np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)), + np.mean(np.array(vali_acc) / vali_steps), vali_speed)) + + except fluid.core.EOFException: + train_pyreader.reset() + break + + +if __name__ == '__main__': + print_arguments(args) + if args.do_test: + test(args) + else: + train(args) diff --git a/ERNIE/utils/__init__.py b/ERNIE/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ERNIE/utils/args.py b/ERNIE/utils/args.py new file mode 100644 index 0000000000000000000000000000000000000000..b9be634f0f383db61eb667df2345a89262179fd8 --- /dev/null +++ b/ERNIE/utils/args.py @@ -0,0 +1,48 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Arguments for configuration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +import argparse + + +def str2bool(v): + # because argparse does not support to parse "true, False" as python + # boolean directly + return v.lower() in ("true", "t", "1") + + +class ArgumentGroup(object): + def __init__(self, parser, title, des): + self._group = parser.add_argument_group(title=title, description=des) + + def add_arg(self, name, type, default, help, **kwargs): + type = str2bool if type == bool else type + self._group.add_argument( + "--" + name, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(six.iteritems(vars(args))): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') diff --git a/ERNIE/utils/fp16.py b/ERNIE/utils/fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..e153c2b9a1029897def264278c5dbe72e1f369f5 --- /dev/null +++ b/ERNIE/utils/fp16.py @@ -0,0 +1,97 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle +import paddle.fluid as fluid + + +def cast_fp16_to_fp32(i, o, prog): + prog.global_block().append_op( + type="cast", + inputs={"X": i}, + outputs={"Out": o}, + attrs={ + "in_dtype": fluid.core.VarDesc.VarType.FP16, + "out_dtype": fluid.core.VarDesc.VarType.FP32 + }) + + +def cast_fp32_to_fp16(i, o, prog): + prog.global_block().append_op( + type="cast", + inputs={"X": i}, + outputs={"Out": o}, + attrs={ + "in_dtype": fluid.core.VarDesc.VarType.FP32, + "out_dtype": fluid.core.VarDesc.VarType.FP16 + }) + + +def copy_to_master_param(p, block): + v = block.vars.get(p.name, None) + if v is None: + raise ValueError("no param name %s found!" % p.name) + new_p = fluid.framework.Parameter( + block=block, + shape=v.shape, + dtype=fluid.core.VarDesc.VarType.FP32, + type=v.type, + lod_level=v.lod_level, + stop_gradient=p.stop_gradient, + trainable=p.trainable, + optimize_attr=p.optimize_attr, + regularizer=p.regularizer, + gradient_clip_attr=p.gradient_clip_attr, + error_clip=p.error_clip, + name=v.name + ".master") + return new_p + + +def create_master_params_grads(params_grads, main_prog, startup_prog, + loss_scaling): + master_params_grads = [] + tmp_role = main_prog._current_role + OpRole = fluid.core.op_proto_and_checker_maker.OpRole + main_prog._current_role = OpRole.Backward + for p, g in params_grads: + # create master parameters + master_param = copy_to_master_param(p, main_prog.global_block()) + startup_master_param = startup_prog.global_block()._clone_variable( + master_param) + startup_p = startup_prog.global_block().var(p.name) + cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog) + # cast fp16 gradients to fp32 before apply gradients + if g.name.find("layer_norm") > -1: + if loss_scaling > 1: + scaled_g = g / float(loss_scaling) + else: + scaled_g = g + master_params_grads.append([p, scaled_g]) + continue + master_grad = fluid.layers.cast(g, "float32") + if loss_scaling > 1: + master_grad = master_grad / float(loss_scaling) + master_params_grads.append([master_param, master_grad]) + main_prog._current_role = tmp_role + return master_params_grads + + +def master_param_to_train_param(master_params_grads, params_grads, main_prog): + for idx, m_p_g in enumerate(master_params_grads): + train_p, _ = params_grads[idx] + if train_p.name.find("layer_norm") > -1: + continue + with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): + cast_fp32_to_fp16(m_p_g[0], train_p, main_prog) diff --git a/ERNIE/utils/init.py b/ERNIE/utils/init.py new file mode 100644 index 0000000000000000000000000000000000000000..3844d01298ecbb70aed37b467aebca62caadd391 --- /dev/null +++ b/ERNIE/utils/init.py @@ -0,0 +1,81 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import six
+import ast
+import copy
+
+import numpy as np
+import paddle.fluid as fluid
+
+
+def cast_fp32_to_fp16(exe, main_program):
+    print("Cast parameters to float16 data format.")
+    for param in main_program.global_block().all_parameters():
+        if not param.name.endswith(".master"):
+            param_t = fluid.global_scope().find_var(param.name).get_tensor()
+            data = np.array(param_t)
+            if param.name.find("layer_norm") == -1:
+                param_t.set(np.float16(data).view(np.uint16), exe.place)
+            master_param_var = fluid.global_scope().find_var(param.name +
+                                                             ".master")
+            if master_param_var is not None:
+                master_param_var.get_tensor().set(data, exe.place)
+
+
+def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
+    assert os.path.exists(
+        init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
+
+    def existed_persistables(var):
+        if not fluid.io.is_persistable(var):
+            return False
+        return os.path.exists(os.path.join(init_checkpoint_path, var.name))
+
+    fluid.io.load_vars(
+        exe,
+        init_checkpoint_path,
+        main_program=main_program,
+        predicate=existed_persistables)
+    print("Load model from {}".format(init_checkpoint_path))
+
+    if use_fp16:
+        cast_fp32_to_fp16(exe, main_program)
+
+
+def init_pretraining_params(exe,
+                            pretraining_params_path,
+                            main_program,
+                            use_fp16=False):
+    assert os.path.exists(pretraining_params_path
+                          ), "[%s] cannot be found." % pretraining_params_path
+
+    def existed_params(var):
+        if not isinstance(var, fluid.framework.Parameter):
+            return False
+        return os.path.exists(os.path.join(pretraining_params_path, var.name))
+
+    fluid.io.load_vars(
+        exe,
+        pretraining_params_path,
+        main_program=main_program,
+        predicate=existed_params)
+    print("Load pretraining parameters from {}.".format(
+        pretraining_params_path))
+
+    if use_fp16:
+        cast_fp32_to_fp16(exe, main_program)
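+
+# A minimal usage sketch (assumed objects and paths, for illustration only):
+#
+#     place = fluid.CPUPlace()
+#     exe = fluid.Executor(place)
+#     exe.run(startup_prog)
+#     init_pretraining_params(exe, "./pretrained_params", main_program=train_prog)
+#
+# init_checkpoint() restores every persistable variable saved at a training
+# step, while init_pretraining_params() restores only trainable Parameters,
+# which is what finetuning from a pretrained ERNIE release needs.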