From e0e9f2e59353ece3c99490b36f64e32c6ebc8f56 Mon Sep 17 00:00:00 2001 From: PROoshio Date: Fri, 25 Jun 2021 17:56:10 +0800 Subject: [PATCH] release/v1 --- ernie-gram/README.md | 129 +++ ernie-gram/finetune/__init__.py | 0 ernie-gram/finetune/classifier.py | 553 ++++++++++++ ernie-gram/finetune/finetune_args.py | 124 +++ ernie-gram/finetune/mrc.py | 629 ++++++++++++++ ernie-gram/finetune/sequence_label.py | 249 ++++++ ernie-gram/lanch.py | 143 ++++ ernie-gram/model/__init__.py | 0 ernie-gram/model/ernie.py | 210 +++++ ernie-gram/model/optimization.py | 171 ++++ ernie-gram/model/transformer_encoder.py | 349 ++++++++ ernie-gram/reader/__init__.py | 0 ernie-gram/reader/batching.py | 248 ++++++ ernie-gram/reader/pretraining.py | 389 +++++++++ ernie-gram/reader/task_reader.py | 1028 +++++++++++++++++++++++ ernie-gram/reader/tokenization.py | 506 +++++++++++ ernie-gram/run.sh | 67 ++ ernie-gram/run_classifier.py | 432 ++++++++++ ernie-gram/run_mrc.py | 424 ++++++++++ ernie-gram/run_sequence_labeling.py | 337 ++++++++ ernie-gram/task_conf | 177 ++++ ernie-gram/utils/__init__.py | 0 ernie-gram/utils/args.py | 68 ++ ernie-gram/utils/cmrc2018_eval.py | 145 ++++ ernie-gram/utils/evaluate_v1.py | 92 ++ ernie-gram/utils/evaluate_v2.py | 285 +++++++ ernie-gram/utils/glue_data_process.sh | 76 ++ ernie-gram/utils/init.py | 72 ++ ernie-gram/utils/utils.sh | 39 + 29 files changed, 6942 insertions(+) create mode 100644 ernie-gram/README.md create mode 100644 ernie-gram/finetune/__init__.py create mode 100644 ernie-gram/finetune/classifier.py create mode 100644 ernie-gram/finetune/finetune_args.py create mode 100644 ernie-gram/finetune/mrc.py create mode 100644 ernie-gram/finetune/sequence_label.py create mode 100644 ernie-gram/lanch.py create mode 100644 ernie-gram/model/__init__.py create mode 100644 ernie-gram/model/ernie.py create mode 100644 ernie-gram/model/optimization.py create mode 100644 ernie-gram/model/transformer_encoder.py create mode 100644 ernie-gram/reader/__init__.py create mode 100644 ernie-gram/reader/batching.py create mode 100644 ernie-gram/reader/pretraining.py create mode 100644 ernie-gram/reader/task_reader.py create mode 100644 ernie-gram/reader/tokenization.py create mode 100644 ernie-gram/run.sh create mode 100644 ernie-gram/run_classifier.py create mode 100644 ernie-gram/run_mrc.py create mode 100644 ernie-gram/run_sequence_labeling.py create mode 100644 ernie-gram/task_conf create mode 100644 ernie-gram/utils/__init__.py create mode 100644 ernie-gram/utils/args.py create mode 100644 ernie-gram/utils/cmrc2018_eval.py create mode 100644 ernie-gram/utils/evaluate_v1.py create mode 100644 ernie-gram/utils/evaluate_v2.py create mode 100644 ernie-gram/utils/glue_data_process.sh create mode 100644 ernie-gram/utils/init.py create mode 100644 ernie-gram/utils/utils.sh diff --git a/ernie-gram/README.md b/ernie-gram/README.md new file mode 100644 index 0000000..d764f83 --- /dev/null +++ b/ernie-gram/README.md @@ -0,0 +1,129 @@ +## _ERNIE-Gram_: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding + +- [Proposed Methods](#proposed-methods) +- [Pre-trained Models](#pre-trained-models) +- [Fine-tuning on Downstream Tasks](#fine-tuning-on-downstream-tasks) + * [GLUE](#glue-benchmark) + * [SQuAD](#squad-benchmark) +- [Usage](#usage) + * [Install PaddlePaddle](#install-paddlepaddle) + * [Fine-tuning](#fine-tuning) + * [Employ Dynamic Computation Graph](#employ-dynamic-computation-graph) +- [Citation](#citation) +- [Communication](#communication) + +For 
a technical description of the algorithm, please see our paper:
+>[_**ERNIE-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding**_](https://www.aclweb.org/anthology/2021.naacl-main.136/)
+>
+>Dongling Xiao, Yu-Kun Li, Han Zhang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang
+>
+>Accepted by **NAACL-HLT 2021**
+
+![ERNIE-Gram](https://img.shields.io/badge/Pretraining-Language%20Understanding-green) ![GLUE](https://img.shields.io/badge/GLUE-The%20General%20Language%20Understanding%20Evaluation-yellow) ![SQuAD](https://img.shields.io/badge/SQuAD-The%20Stanford%20Question%20Answering-blue) ![RACE](https://img.shields.io/badge/RACE-The%20ReAding%20Comprehension%20from%20Examinations-green)
+---
+**[ERNIE-Gram](https://www.aclweb.org/anthology/2021.naacl-main.136/)** is an **explicit** n-gram masking and predicting method that removes the limitations of previous contiguous masking strategies and sufficiently incorporates coarse-grained linguistic information into pre-training. To model the intra-dependencies and inter-relations of coarse-grained linguistic information, n-grams are masked and predicted directly using explicit n-gram identities rather than contiguous sequences of n tokens. Furthermore, ERNIE-Gram employs a generator model to sample plausible n-gram identities as optional n-gram masks and predicts them in both coarse-grained and fine-grained manners, enabling comprehensive n-gram prediction and relation modeling.
+
+## Proposed Methods
+
+We construct three novel methods to model the intra-dependencies and inter-relations of coarse-grained linguistic information:
+
+- **Explicitly N-gram Masked Language Modeling**: n-grams are masked with single [MASK] symbols, and predicted directly using explicit n-gram identities rather than sequences of tokens.
+- **Comprehensive N-gram Prediction**: masked n-grams are simultaneously predicted in coarse-grained (explicit n-gram identities) and fine-grained (contained token identities) manners.
+- **Enhanced N-gram Relation Modeling**: n-grams are masked with plausible n-gram identities sampled from a generator model, and then recovered to the original n-grams.
+
+![ernie-gram](.meta/ernie-gram.png)
+
+## Pre-trained Models
+
+We release checkpoints for the **ERNIE-Gram _16G_** and **ERNIE-Gram _160G_** models, which are pre-trained on the base-scale corpora (16GB of text, as for BERT) and the large-scale corpora (160GB of text, as for RoBERTa), respectively.
+
+- [**ERNIE-Gram _16G_**](https://ernie-github.cdn.bcebos.com/model-ernie-gram-en-16g.tar.gz) (_lowercased | 12-layer, 768-hidden, 12-heads, 110M parameters_)
+- [**ERNIE-Gram _160G_**](https://ernie-github.cdn.bcebos.com/model-ernie-gram-en-160g.tar.gz) (_lowercased | 12-layer, 768-hidden, 12-heads, 110M parameters_)
+
+
+## Fine-tuning on Downstream Tasks
+
+We compare the performance of [ERNIE-Gram](https://www.aclweb.org/anthology/2021.naacl-main.136/) with existing SOTA pre-training models for natural language understanding ([MPNet](https://arxiv.org/abs/2004.09297), [UniLMv2](https://arxiv.org/abs/2002.12804), [ELECTRA](https://arxiv.org/abs/2003.10555), [RoBERTa](https://arxiv.org/abs/1907.11692) and [XLNet](https://arxiv.org/abs/1906.08237)) on several language understanding tasks, including the [GLUE benchmark](https://openreview.net/pdf?id=rJ4km2R5t7) (General Language Understanding Evaluation) and [SQuAD](https://arxiv.org/abs/1606.05250) (Stanford Question Answering).
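+
+To reproduce these fine-tuning runs you first need one of the pre-trained checkpoints listed above unpacked locally. As a minimal sketch (the URL is the one given in the Pre-trained Models section; the unpacked directory is whatever the archive contains, and is what you later pass as `MODEL_PATH` in the Fine-tuning section below):
+
+```script
+# example only: the 16G checkpoint works the same way, just swap the filename
+wget https://ernie-github.cdn.bcebos.com/model-ernie-gram-en-160g.tar.gz
+tar -xzf model-ernie-gram-en-160g.tar.gz
+```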
+
+
+### GLUE benchmark
+The General Language Understanding Evaluation ([GLUE](https://openreview.net/pdf?id=rJ4km2R5t7)) is a multi-task benchmark consisting of various NLU tasks, which include 1) pairwise classification tasks such as language inference ([MNLI](https://www.aclweb.org/anthology/N18-1101), [RTE](http://dx.doi.org/10.1007/11736790_9)), question answering (QNLI) and paraphrase detection (QQP, [MRPC](https://www.aclweb.org/anthology/I05-5002)); 2) single-sentence classification tasks such as linguistic acceptability ([CoLA](https://www.aclweb.org/anthology/Q19-1040)) and sentiment analysis ([SST-2](https://www.aclweb.org/anthology/D13-1170)); and 3) a text similarity task ([STS-B](https://www.aclweb.org/anthology/S17-2001)).
+
+The results on GLUE are presented as follows:
+
+|Tasks| MNLI | QNLI | QQP | SST-2 | CoLA | MRPC | RTE | STS-B | AVG |
+| :--------| :------: | :------: | :------: | :------: | :------: | :------: | :------: | :------: | :------: |
+|Metrics| ACC | ACC | ACC | ACC | MCC | ACC | ACC | PCC | AVG |
+| XLNet |86.8|91.7|91.4|94.7|60.2|88.2|74.0|89.5|84.5|
+| RoBERTa |87.6|92.8|91.9|94.8|63.6|90.2|78.7|91.2|86.4|
+| ELECTRA |88.8|93.2|91.5|95.2|67.7|89.5|82.7|91.2|87.5|
+| UniLMv2 |88.5|**93.5**|91.7|95.1|65.2|**91.8**|81.3|91.0|87.3|
+| MPNet |88.5|93.3|91.9|95.4|65.0|91.5|**85.2**|90.9|87.7|
+| **ERNIE-Gram** |**89.1**|93.2|**92.2**|**95.6**|**68.6**|90.7|83.8|**91.3**|**88.1**|
+
+Download the [GLUE data](https://gluebenchmark.com/tasks) by running [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpack it to some directory `${TASK_DATA_PATH}`.
+
+After the dataset is downloaded, run `sh ./utils/glue_data_process.sh $TASK_DATA_PATH` to convert the data into the format used for training. If everything goes well, a folder named `data` will be created with all the converted data in it.
+
+### SQuAD benchmark
+The Stanford Question Answering (SQuAD) tasks are designed to extract the answer span within the given passage conditioned on the question. We conduct experiments on [SQuAD1.1](https://www.aclweb.org/anthology/D16-1264) and [SQuAD2.0](https://www.aclweb.org/anthology/P18-2124) by adding a classification layer on the sequence outputs of ERNIE-Gram and predicting whether each token is the start or end position of the answer span.
+
+The results on SQuAD are presented as follows:
+
+| Tasks | SQuADv1 | SQuADv2 |
+| :-------------------------------------------------------- | :----------------------------: | :----------------------: |
+| Metrics | EM / F1 | EM / F1 |
+| RoBERTa |84.6 / 91.5|80.5 / 83.7|
+| XLNet |- / - | 80.2 / -|
+| ELECTRA |86.8 / - | 80.5 / -|
+| MPNet |86.8 / 92.5 | 82.8 / 85.6|
+| UniLMv2 |87.1 / 93.1 | 83.3 / 86.1|
+| **ERNIE-Gram** |**87.2** / **93.2** | **84.1** / **87.1**|
+
+The preprocessed data for SQuAD can be downloaded from [SQuADv1](https://ernie-github.cdn.bcebos.com/data-SQuADv1.tar.gz) and [SQuADv2](https://ernie-github.cdn.bcebos.com/data-SQuADv2.tar.gz). Please unpack them to `./data`.
+
+The preprocessed data for tasks involving long text can be downloaded from [RACE](https://ernie-github.cdn.bcebos.com/data-RACE.tar.gz), [IMDB](https://ernie-github.cdn.bcebos.com/data-IMDB.tar.gz) and [AG'news](https://ernie-github.cdn.bcebos.com/data-AG.tar.gz). Please unpack them to `./data`.
+
+## Usage
+
+### Install PaddlePaddle
+
+This code base has been tested with PaddlePaddle 2.0.0+. You can install PaddlePaddle by following the instructions at [this site](https://www.paddlepaddle.org.cn/install/quick).
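+
+For example, with pip (a minimal sketch; take the exact package name and version for your CUDA/cuDNN setup from the install page above), you can install the framework and verify that it can see your GPUs:
+
+```script
+# the version pin below is an assumption; the CPU-only package is `paddlepaddle`
+pip install paddlepaddle-gpu==2.0.0
+python -c "import paddle; paddle.utils.run_check()"
+```
+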
+### Fine-tuning
+Please make sure the CUDA, cuDNN and NCCL2 libraries are included in `LD_LIBRARY_PATH` before running ERNIE-Gram. The parameter configurations for the fine-tuning tasks are provided in `./task_conf`, so you can easily launch fine-tuning through these configuration files. For example, you can fine-tune the ERNIE-Gram model on RTE with:
+```script
+TASK="RTE" # MNLI, SST-2, CoLA, SQuADv1..., please see ./task_conf
+MODEL_PATH="./ernie-gram-160g" # path to the pre-trained model
+sh run.sh ${TASK} ${MODEL_PATH}
+```
+The training log and evaluation results are written to `log/*job.log.0`. To fine-tune on your own task data, refer to the data formats we provide when preparing your data.
+
+
+
+### Employ Dynamic Computation Graph
+
+The dynamic-graph implementation of ERNIE-Gram-zh is more concise and flexible; please refer to [ERNIE-Gram Dygraph](https://github.com/PaddlePaddle/ERNIE/tree/develop/ernie-gram) for usage details.
+
+## Citation
+
+You can cite the paper as below:
+
+```
+@article{xiao2021ernie-gram,
+  title={ERNIE-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding},
+  author={Xiao, Dongling and Li, Yukun and Zhang, Han and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
+  journal={arXiv preprint arXiv:2010.12148},
+  year={2021}
+}
+```
+
+## Communication
+
+- [ERNIE homepage](https://wenxin.baidu.com/)
+- [GitHub Issues](https://github.com/PaddlePaddle/ERNIE/issues): bug reports, feature requests, install issues, usage issues, etc.
+- QQ discussion group: 760439550 (ERNIE discussion group).
+- QQ discussion group: 958422639 (ERNIE discussion group-v2).
+- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
+
diff --git a/ernie-gram/finetune/__init__.py b/ernie-gram/finetune/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ernie-gram/finetune/classifier.py b/ernie-gram/finetune/classifier.py
new file mode 100644
index 0000000..39cf031
--- /dev/null
+++ b/ernie-gram/finetune/classifier.py
@@ -0,0 +1,553 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model for classifier.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +import six +import subprocess +import numpy as np +import json +from scipy.stats import pearsonr, spearmanr +from collections import defaultdict +from six.moves import xrange +import paddle.fluid as fluid + +from model.ernie import ErnieModel + +if six.PY2: + import commands as subprocess + + +def create_model(args, pyreader_name, ernie_config, is_prediction=False, is_classify=False, is_regression=False, for_race=False, has_fc=True): + + shapes = [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1], + [-1, 1], [-1, args.max_seq_len, args.max_seq_len, 1]] + dtypes=['int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'] + lod_levels=[0, 0, 0, 0, 0, 0, 0, 0] + if is_regression: + dtypes[-3] = 'float32' + + if for_race: + shapes.append([-1, 1]) + dtypes.append('float32') + lod_levels.append(0) + pyreader = fluid.layers.py_reader( + capacity=50, + shapes=shapes, + dtypes=dtypes, + lod_levels=lod_levels, + name=pyreader_name, + use_double_buffer=True) + if for_race: + (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, + qids, rel_pos_scaler, labels_pair) = fluid.layers.read_file(pyreader) + else: + (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, + qids, rel_pos_scaler) = fluid.layers.read_file(pyreader) + + checkpoints = [] + + ernie = ErnieModel( + src_ids=src_ids, + position_ids=[pos_ids, rel_pos_scaler], + sentence_ids=sent_ids, + task_ids=task_ids, + input_mask=input_mask, + config=ernie_config, + use_fp16=args.use_fp16) + checkpoints.extend(ernie.get_checkpoints()) + + cls_feats = ernie.get_pooled_output(has_fc) + cls_feats = fluid.layers.dropout( + x=cls_feats, + dropout_prob=0.1, + dropout_implementation="upscale_in_train") + size = 1 if for_race else args.num_labels # for race dataset + logits = fluid.layers.fc( + input=cls_feats, + size=size, + param_attr=fluid.ParamAttr( + name="cls_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_out_b", initializer=fluid.initializer.Constant(0.))) + if for_race: + loss_pair = fluid.layers.sigmoid_cross_entropy_with_logits(logits, labels_pair) + logits = fluid.layers.reshape(logits, [-1, 4]) + + + if is_prediction: + probs = fluid.layers.softmax(logits) + feed_targets_name = [ + src_ids.name, pos_ids.name, sent_ids.name, task_ids.name, input_mask.name + ] + return pyreader, probs, feed_targets_name + + num_seqs = fluid.layers.create_tensor(dtype='int64') + if is_classify: + ce_loss, probs = fluid.layers.softmax_with_cross_entropy( + logits=logits, label=labels, return_softmax=True) + + loss = fluid.layers.mean(x=ce_loss) + if for_race: + loss += 0.5 * fluid.layers.mean(x=loss_pair) + + accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) + + graph_vars = { + "loss": loss, + "probs": probs, + "accuracy": accuracy, + "labels": labels, + "num_seqs": num_seqs, + "qids": qids, + "checkpoints": checkpoints + } + elif is_regression: + if False: + logits = fluid.layers.sigmoid(logits) + cost = fluid.layers.square_error_cost(input=logits, label=labels) + loss = fluid.layers.mean(x=cost) + graph_vars = { + "loss": loss, + "probs": logits, + "labels": labels, + "num_seqs": num_seqs, + "qids": qids + } + + + for k, v in graph_vars.items(): + if k != "checkpoints": + v.persistable = True + + 
return pyreader, graph_vars + +def write_result(output_path, eval_phase, gpu_id, eval_index, save_lists=None): + outfile = output_path + "/" + eval_phase + if eval_index is not None: + outfile_part = outfile + ".part" + str(gpu_id) + writer = open(outfile_part, "w") + write_content = "\t".join([str(i) for i in eval_index]) + "\n" + writer.write(write_content) + writer.close() + if save_lists is not None: + save_list_name = ["qids", "labels", "scores"] + for idx in range(len(save_list_name)): + save_list = json.dumps(save_lists[idx]) + savefile_part = outfile + "." + save_list_name[idx] + ".part." + str(gpu_id) + list_writer = open(savefile_part, "w") + list_writer.write(save_list) + list_writer.close() + tmp_writer = open(output_path + "/" + eval_phase + "_dec_finish." + str(gpu_id), "w") + tmp_writer.close() + + +def concat_result(output_path, eval_phase, dev_count, num_eval_index, num_list=None, eval_span=None): + outfile = output_path + "/" + eval_phase + eval_index_all = [0.0] * num_eval_index + eval_list_all = defaultdict(list) + while True: + _, ret = subprocess.getstatusoutput('find ' + output_path + \ + ' -maxdepth 1 -name ' + eval_phase + '"_dec_finish.*"') + ret = ret.split("\n") + if len(ret) != dev_count: + time.sleep(1) + continue + + for dev_cnt in range(dev_count): + if not eval_span: + fin = open(outfile + ".part" + str(dev_cnt)) + cur_eval_index_all = fin.readline().strip().split("\t") + cur_eval_index_all = [float(i) for i in cur_eval_index_all] + eval_index_all = list(map(lambda x :x[0]+x[1], zip(eval_index_all, cur_eval_index_all))) + + if num_list is not None: + save_list_name = ["qids", "labels", "scores"] + for idx in range(len(save_list_name)): + fin_list = open(outfile + "." + save_list_name[idx] + ".part." + str(dev_cnt), "r") + eval_list_all[save_list_name[idx]].extend(json.loads(fin_list.read())) + + subprocess.getstatusoutput("rm " + outfile + ".*part*") + subprocess.getstatusoutput("rm " + output_path + "/" + eval_phase + "_dec_finish.*") + break + if num_list is not None: + return eval_list_all + return eval_index_all + +def merge_results(qids, labels, scores): + dic = {} + corr = 0 + for ind, qid in enumerate(map(str, qids)): + if qid in dic: + dic[qid]["scores"].append(scores[ind]) + else: + dic[qid] = {} + dic[qid]["labels"] = labels[ind] + dic[qid]["scores"] = [scores[ind]] + for qid in dic.keys(): + score = dic[qid]["scores"] + pred = list(map(lambda i:(max(score[i]), score[i].index(max(score[i]))), range(len(score)))) + pred = sorted(pred, key=lambda x:x[0], reverse=True)[0][1] + + if pred == dic[qid]["labels"]: + corr += 1 + return float(corr) / len(dic.keys()), len(dic.keys()) + +def evaluate_regression(exe, + test_program, + test_pyreader, + graph_vars, + eval_phase, + tag_num=None, + dev_count=1, + metric='pearson_and_spearman'): + + if eval_phase == "train": + # train_fetch_list = [graph_vars["loss"].name, graph_vars["num_seqs"].name] + # if "learning_rate" in graph_vars: + # train_fetch_list.append(graph_vars["learning_rate"].name) + train_fetch_list = [graph_vars["loss"].name] + if "learning_rate" in graph_vars: + train_fetch_list.append(graph_vars["learning_rate"].name) + outputs = exe.run(fetch_list=train_fetch_list) + ret = {"loss": np.mean(outputs[0])} + if "learning_rate" in graph_vars: + ret["learning_rate"] = float(outputs[1][0]) + return ret + + test_pyreader.start() + total_cost, total_num_seqs = 0.0, 0.0 + qids, labels, scores = [], [], [] + + fetch_list = [ + graph_vars["loss"].name, + graph_vars["probs"].name, + 
graph_vars["labels"].name, + graph_vars["qids"].name + ] + + time_begin = time.time() + while True: + try: + if dev_count == 1: + np_loss, np_probs, np_labels, np_qids = exe.run( + program=test_program, fetch_list=fetch_list) + else: + np_loss, np_probs, np_labels, np_qids = exe.run( + fetch_list=fetch_list) + #total_cost += np.sum(np_loss * np_num_seqs) + #total_num_seqs += np.sum(np_num_seqs) + labels.extend(np_labels.reshape((-1)).tolist()) + if np_qids is None: + qids.extend(list(range(len(np_labels)))) + else: + qids.extend(np_qids.reshape(-1).tolist()) + scores.extend(np_probs.reshape(-1).tolist()) + except fluid.core.EOFException: + test_pyreader.reset() + break + time_end = time.time() + + #cost = total_cost / total_num_seqs + elapsed_time = time_end - time_begin + + meta = {} + best_thre = None + if metric == 'pearson_and_spearman': + ret = pearson_and_spearman(scores, labels) + meta['score'] = ret['pearson'] + print("[%s evaluation] ave loss: %f, pearson: %f, spearman: %f, corr: %f, elapsed time: %f s" \ + % (eval_phase, 0.0, ret['pearson'], ret['spearmanr'], ret['corr'], elapsed_time)) + elif metric == 'matthews_corrcoef': + best_score = -1000000 + best_thresh = None + scores = np.array(scores) + scores = 1 / (1 + np.exp(-scores)) + for s in range(0, 1000): + T = s / 1000.0 + pred = (scores > T).astype('int') + matt_score = matthews_corrcoef(pred, labels) + if matt_score > best_score: + best_score = matt_score + best_thre = T + print("[%s evaluation] ave loss: %f, matthews_corrcoef: %f, data_num: %d, elapsed time: %f s, best_thres: %f" % (eval_phase, 0.0, best_score, total_num_seqs, elapsed_time, best_thre)) + else: + raise ValueError('unsupported metric {}'.format(metric)) + + #return {'best_thre': best_thre}, evaluate_info + + + +def evaluate_classify(exe, test_program, test_pyreader, graph_vars, eval_phase, + use_multi_gpu_test=False, gpu_id=0, output_path="./tmpout", dev_count=1, metric='simple_accuracy', eval_span=False): + train_fetch_list = [ + graph_vars["loss"].name, graph_vars["accuracy"].name, + graph_vars["num_seqs"].name + ] + + if eval_phase == "train": + outputs = exe.run(fetch_list=train_fetch_list) + ret = {"loss": np.mean(outputs[0]), "accuracy": np.mean(outputs[1])} + return ret + + test_pyreader.start() + total_cost, total_acc, total_num_seqs, tp, tn, fp, fn = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 + qids, labels, scores, preds = [], [], [], [] + time_begin = time.time() + + fetch_list = [ + graph_vars["loss"].name, graph_vars["accuracy"].name, + graph_vars["probs"].name, graph_vars["labels"].name, + graph_vars["num_seqs"].name, graph_vars["qids"].name + ] + while True: + try: + np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run( + program=test_program, fetch_list=fetch_list) + total_cost += np.sum(np_loss * np_num_seqs) + total_acc += np.sum(np_acc * np_num_seqs) + total_num_seqs += np.sum(np_num_seqs) + labels.extend(np_labels.reshape((-1)).tolist()) + if np_qids is None: + qids.extend(list(range(len(np_labels)))) + else: + qids.extend(np_qids.reshape(-1).tolist()) + scores.extend(np_probs.tolist()) + np_preds = np.argmax(np_probs, axis=1).astype(np.float32) + preds.extend(np_preds.reshape((-1)).tolist()) + tp += np.sum((np_labels == 1) & (np_preds == 1)) + tn += np.sum((np_labels == 0) & (np_preds == 0)) + fp += np.sum((np_labels == 0) & (np_preds == 1)) + fn += np.sum((np_labels == 1) & (np_preds == 0)) + + except fluid.core.EOFException: + test_pyreader.reset() + break + time_end = time.time() + if True: + if dev_count == 1: + meta = {} + 
evaluate_info = "" + if metric == 'acc_and_f1': + ret = acc_and_f1(preds, labels) + print("[%s evaluation] ave loss: %f, ave_acc: %f, f1: %f, data_num: %d, elapsed time: %f s" \ + % (eval_phase, total_cost / total_num_seqs, ret['acc'], ret['f1'], total_num_seqs, time_end - time_begin)) + meta['score'] = ret['f1'] + + elif metric == 'matthews_corrcoef': + ret = matthews_corrcoef(preds, labels) + print("[%s evaluation] ave loss: %f, matthews_corrcoef: %f, data_num: %d, elapsed time: %f s" \ + % (eval_phase, total_cost / total_num_seqs, ret, total_num_seqs, time_end - time_begin)) + meta['score'] = ret + + elif metric == 'matthews_corrcoef_and_accf1': + + mat_ret = matthews_corrcoef(preds, labels) + sim_ret = acc_and_f1(preds, labels) + + evaluate_info = "[%s evaluation] ave loss: %f, matthews_corrcoef: %f, acc: %f, f1: %f, data_num: %d, elapsed time: %f s" \ + % (eval_phase, cost, mat_ret, sim_ret['acc'], sim_ret['f1'], total_num_seqs, elapsed_time) + + meta['score'] = mat_ret + + elif metric == 'pearson_and_spearman': + ret = pearson_and_spearman(scores, labels) + print("[%s evaluation] ave loss: %f, pearson:%f, spearman:%f, corr:%f, data_num: %d, elapsed time: %f s" \ + % (eval_phase, total_cost / total_num_seqs, ret['pearson'], ret['spearman'], ret['corr'], total_num_seqs, time_end - time_begin)) + meta['score'] = (ret['pearson'] + ret['spearman']) / 2.0 + + elif metric == 'simple_accuracy': + ret = simple_accuracy(preds, labels) + print("[%s evaluation] ave loss: %f, acc:%f, data_num: %d, elapsed time: %f s" \ + % (eval_phase, total_cost / total_num_seqs, ret, total_num_seqs, time_end - time_begin)) + meta['score'] = ret + + elif metric == "acc_and_f1_and_mrr": + ret_a = acc_and_f1(preds, labels) + preds = sorted(zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1])) + ret_b = evaluate_mrr(preds) + evaluate_info = "[%s evaluation] ave loss: %f, acc: %f, f1: %f, mrr: %f, data_num: %d, elapsed time: %f s" \ + % (eval_phase, cost, ret_a['acc'], ret_a['f1'], ret_b, total_num_seqs, elapsed_time) + meta['score'] = ret_a['f1'] + else: + raise ValueError('unsupported metric {}'.format(metric)) + + else: + if metric== 'simple_accuracy': + if not eval_span: + write_result(output_path, eval_phase, gpu_id, [total_acc, total_num_seqs]) + if gpu_id == 0: + acc_sum, data_num = concat_result(output_path, eval_phase, dev_count, 2) + print( + "[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s" + % (eval_phase, total_cost / total_num_seqs, acc_sum / data_num, + int(data_num), time_end - time_begin)) + else: + write_result(output_path, eval_phase, gpu_id, None, [qids, labels, scores]) + if gpu_id == 0: + ret = concat_result(output_path, eval_phase, dev_count, 0, 1, True) + qids, labels, scores = ret["qids"], ret["labels"], ret["scores"] + acc, data_num = merge_results(qids, labels, scores) + print( + "[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s" + % (eval_phase, total_cost / total_num_seqs, acc, + int(data_num), time_end - time_begin)) + + + elif metric== 'matthews_corrcoef': + write_result(output_path, eval_phase, gpu_id, [tp, tn, fp, fn]) + if gpu_id == 0: + tp, tn, fp, fn = concat_result(output_path, eval_phase, dev_count, 2) + mcc = ( (tp*tn)-(fp*fn)) / np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn) ) + print( + "[%s evaluation] ave loss: %f, ave mcc: %f, elapsed time: %f s" + % (eval_phase, total_cost / total_num_seqs, mcc, time_end - time_begin)) + else: + is_print = True + if dev_count > 1: + is_print = False + write_result(output_path, 
eval_phase, gpu_id, [total_correct_num, total_label_pos_num, total_pred_pos_num], [qids, labels, scores]) + if gpu_id == 0: + is_print = True + eval_index_all, eval_list_all = concat_result(output_path, eval_phase, dev_count, 3, 3) + total_correct_num, total_label_pos_num, total_pred_pos_num = eval_index_all + qids, labels, scores = [eval_list_all[name] for name in ["qids", "labels", "scores"]] + + if is_print: + r = total_correct_num / total_label_pos_num + p = total_correct_num / total_pred_pos_num + f = 2 * p * r / (p + r) + + assert len(qids) == len(labels) == len(scores) + preds = sorted( + zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1])) + mrr = evaluate_mrr(preds) + map = evaluate_map(preds) + + print( + "[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s" + % (eval_phase, total_cost / total_num_seqs, + total_acc / total_num_seqs, mrr, map, p, r, f, total_num_seqs, + time_end - time_begin)) + +def simple_accuracy(preds, labels): + preds = np.array(preds) + labels = np.array(labels) + return (preds == labels).mean() + + +def matthews_corrcoef(preds, labels): + preds = np.array(preds) + labels = np.array(labels) + tp = np.sum((labels == 1) & (preds == 1)) + tn = np.sum((labels == 0) & (preds == 0)) + fp = np.sum((labels == 0) & (preds == 1)) + fn = np.sum((labels == 1) & (preds == 0)) + + mcc = ( (tp*tn)-(fp*fn)) / np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn) ) + return mcc + +def pearson_and_spearman(preds, labels): + preds = np.array(preds) + labels = np.array(labels) + + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + +def evaluate_mrr(preds): + last_qid = None + total_mrr = 0.0 + qnum = 0.0 + rank = 0.0 + correct = False + for qid, score, label in preds: + if qid != last_qid: + rank = 0.0 + qnum += 1 + correct = False + last_qid = qid + + rank += 1 + if not correct and label != 0: + total_mrr += 1.0 / rank + correct = True + + return total_mrr / qnum + + +def evaluate_map(preds): + def singe_map(st, en): + total_p = 0.0 + correct_num = 0.0 + for index in xrange(st, en): + if int(preds[index][2]) != 0: + correct_num += 1 + total_p += correct_num / (index - st + 1) + if int(correct_num) == 0: + return 0.0 + return total_p / correct_num + + last_qid = None + total_map = 0.0 + qnum = 0.0 + st = 0 + for i in xrange(len(preds)): + qid = preds[i][0] + if qid != last_qid: + qnum += 1 + if last_qid != None: + total_map += singe_map(st, i) + st = i + last_qid = qid + + total_map += singe_map(st, len(preds)) + return total_map / qnum + +def acc_and_f1(preds, labels): + preds = np.array(preds) + labels = np.array(labels) + + acc = simple_accuracy(preds, labels) + f1 = f1_score(preds, labels) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + +def f1_score(preds, labels): + preds = np.array(preds) + labels = np.array(labels) + + tp = np.sum((labels == 1) & (preds == 1)) + tn = np.sum((labels == 0) & (preds == 0)) + fp = np.sum((labels == 0) & (preds == 1)) + fn = np.sum((labels == 1) & (preds == 0)) + p = tp / (tp+fp) + r = tp / (tp+fn) + f1 = (2*p*r) / (p+r+1e-8) + return f1 + + diff --git a/ernie-gram/finetune/finetune_args.py b/ernie-gram/finetune/finetune_args.py new file mode 100644 index 0000000..9db2207 --- /dev/null +++ b/ernie-gram/finetune/finetune_args.py @@ -0,0 +1,124 @@ +# Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + + +import os +import time +import argparse + +from utils.args import ArgumentGroup + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +model_g = ArgumentGroup(parser, "model", "model configuration and paths.") +model_g.add_arg("ernie_config_path", str, None, "Path to the json file for ernie model config.") +model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.") +model_g.add_arg("init_pretraining_params", str, None, + "Init pre-training params which preforms fine-tuning from. If the " + "arg 'init_checkpoint' has been set, this argument wouldn't be valid.") +model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.") + +model_g.add_arg("is_classify", bool, True, "is_classify") +model_g.add_arg("is_regression", bool, False, "is_regression") +model_g.add_arg("eval_span", bool, False, "evaluate task involing long text") +model_g.add_arg("task_id", int, 0, "task id") + +train_g = ArgumentGroup(parser, "training", "training options.") +train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") +train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.") +train_g.add_arg("layer_wise_decay_rate", float, 0.8, "Layer-wise learning decay rate used to train.") +train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", + "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay']) +train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.") +train_g.add_arg("warmup_proportion", float, 0.1, + "Proportion of training steps to perform linear learning rate warmup for.") +train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.") +train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.") +train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.") +train_g.add_arg("use_dynamic_loss_scaling", bool, False, "Whether to use dynamic loss scaling.") +train_g.add_arg("init_loss_scaling", float, 1.0, + "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.") +train_g.add_arg("test_save", str, "./checkpoints/test_result", "test_save") +train_g.add_arg("metric", str, "simple_accuracy", "metric") +train_g.add_arg("incr_every_n_steps", int, 100, "Increases loss scaling every n consecutive.") +train_g.add_arg("decr_every_n_nan_or_inf", int, 2, + "Decreases loss scaling every n accumulated steps with nan or inf gradients.") +train_g.add_arg("incr_ratio", float, 2.0, + "The multiplier to use when increasing the loss scaling.") +train_g.add_arg("decr_ratio", float, 0.8, + "The less-than-one-multiplier to use when decreasing.") +train_g.add_arg("run_file_path", str, None, "Number of epoches 
for fine-tuning.") +train_g.add_arg("use_recompute", bool, False, "Whether to use recompute.") +train_g.add_arg("use_fuse", bool, False, "Whether to use fuse_allreduce_ops.") +train_g.add_arg("nccl_comm_num", int, 1, "NCCL comm num.") +train_g.add_arg("hierarchical_allreduce_inter_nranks", int, 8, "Hierarchical allreduce inter ranks.") +train_g.add_arg("use_hierarchical_allreduce", bool, False, "Use hierarchical allreduce or not.") +train_g.add_arg("version_2", bool, False, "Squad v2.") +train_g.add_arg("for_race", bool, False, "For RACE dataset.") +train_g.add_arg("has_fc", bool, True, "Apply linear fc for classification.") + +log_g = ArgumentGroup(parser, "logging", "logging related.") +log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.") +log_g.add_arg("verbose", bool, False, "Whether to output verbose log.") + +data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options") +data_g.add_arg("tokenizer", str, "FullTokenizer", + "ATTENTION: the INPUT must be splited by Word with blank while using SentencepieceTokenizer or WordsegTokenizer") +data_g.add_arg("train_set", str, None, "Path to training data.") +data_g.add_arg("test_set", str, None, "Path to test data.") +data_g.add_arg("dev_set", str, None, "Path to validation data.") +data_g.add_arg("vocab_path", str, None, "Vocabulary path.") +data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.") +data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.") +data_g.add_arg("predict_batch_size", int, None, "Total examples' number in batch for predict. see also --in_tokens.") +data_g.add_arg("in_tokens", bool, False, + "If set, the batch size will be the maximum number of tokens in one batch. " + "Otherwise, it will be the maximum number of examples in one batch.") +data_g.add_arg("do_lower_case", bool, True, + "Whether to lower case the input text. 
Should be True for uncased models and False for cased models.") +data_g.add_arg("random_seed", int, None, "Random seed.") +data_g.add_arg("label_map_config", str, None, "label_map_path.") +data_g.add_arg("num_labels", int, 2, "label number") +data_g.add_arg("diagnostic", str, None, "GLUE Diagnostic Dataset") +data_g.add_arg("diagnostic_save", str, None, "GLUE Diagnostic save f") +data_g.add_arg("max_query_length", int, 64, "Max query length.") +data_g.add_arg("max_answer_length", int, 30, "Max answer length.") +data_g.add_arg("doc_stride", int, 128, + "When splitting up a long document into chunks, how much stride to take between chunks.") +data_g.add_arg("n_best_size", int, 20, + "The total number of n-best predictions to generate in the nbest_predictions.json output file.") +data_g.add_arg("chunk_scheme", type=str, default="IOB", choices=["IO", "IOB", "IOE", "IOBES"], help="chunk scheme") + +run_type_g = ArgumentGroup(parser, "run_type", "running type options.") +run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") +run_type_g.add_arg("is_distributed", bool, False, "If set, then start distributed training.") +run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).") +run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.") +run_type_g.add_arg("do_train", bool, True, "Whether to perform training.") +run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.") +run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.") +run_type_g.add_arg("use_multi_gpu_test", bool, False, "Whether to perform evaluation using multiple gpu cards") +run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.") +run_type_g.add_arg("stream_job", str, None, "if not None, then stream finetuning task by job id.") +run_type_g.add_arg("shuffle", bool, True, "") +run_type_g.add_arg("for_cn", bool, True, "model train for cn or for other langs.") + +parser.add_argument("--enable_ce", action='store_true', help="The flag indicating whether to run the task for continuous evaluation.") +# yapf: enable + diff --git a/ernie-gram/finetune/mrc.py b/ernie-gram/finetune/mrc.py new file mode 100644 index 0000000..ac45c13 --- /dev/null +++ b/ernie-gram/finetune/mrc.py @@ -0,0 +1,629 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Model for classifier.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +import numpy as np +import os +import math +import json +import collections +import six +import subprocess + +from scipy.stats import pearsonr, spearmanr +from six.moves import xrange +import paddle.fluid as fluid + +from model.ernie import ErnieModel +import reader.tokenization as tokenization + +if six.PY2: + import commands as subprocess + +def create_model(args, pyreader_name, ernie_config, is_training): + pyreader = fluid.layers.py_reader( + capacity=50, + shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1], [-1, args.max_seq_len, args.max_seq_len, 1]], + dtypes=[ + 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64', 'int64'], + lod_levels=[0, 0, 0, 0, 0, 0, 0, 0, 0], + name=pyreader_name, + use_double_buffer=True) + (src_ids, sent_ids, pos_ids, task_ids, input_mask, start_positions, + end_positions, unique_id, rel_pos_scaler) = fluid.layers.read_file(pyreader) + checkpoints = [] + + ernie = ErnieModel( + src_ids=src_ids, + position_ids=[pos_ids, rel_pos_scaler], + sentence_ids=sent_ids, + task_ids=task_ids, + input_mask=input_mask, + config=ernie_config, + use_fp16=args.use_fp16, + has_sent_emb=True) + checkpoints.extend(ernie.get_checkpoints()) + + enc_out = ernie.get_sequence_output() + enc_out = fluid.layers.dropout( + x=enc_out, + dropout_prob=0.1, + dropout_implementation="upscale_in_train") + + logits = fluid.layers.fc( + input=enc_out, + size=2, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name="cls_mrc_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_mrc_out_b", initializer=fluid.initializer.Constant(0.))) + print(enc_out.shape, logits.shape) + + logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1]) + start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0) + print(logits.shape, start_logits.shape, end_logits.shape) + #input_mask = fluid.layers.flatten(input_mask, axis=1) + #mask_bias = (input_mask - 1.0) * 1e7 + #start_logits += mask_bias + #end_logits += mask_bias + + batch_ones = fluid.layers.fill_constant_batch_size_like( + input=start_logits, dtype='int64', shape=[1], value=1) + print(batch_ones.shape) + num_seqs = fluid.layers.reduce_sum(input=batch_ones) + + + def compute_loss(logits, positions): + loss = fluid.layers.softmax_with_cross_entropy( + logits=logits, label=positions) + loss = fluid.layers.mean(x=loss) + return loss + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + loss = (start_loss + end_loss) / 2.0 + if args.use_fp16 and args.loss_scaling > 1.0: + loss *= args.loss_scaling + + graph_vars = { + "loss": loss, + "num_seqs": num_seqs, + "unique_id": unique_id, + "start_logits": start_logits, + "end_logits": end_logits, + "checkpoints": checkpoints + } + + for k, v in graph_vars.items(): + if k != "checkpoints": + v.persistable = True + + return pyreader, graph_vars + + +def write_result(output_path, eval_phase, gpu_id, all_results): + outfile = output_path + "/" + eval_phase + outfile_part = outfile + ".part" + str(gpu_id) + writer = open(outfile_part, "w") + save_dict = json.dumps(all_results) + writer.write(save_dict) + writer.close() + tmp_writer = open(output_path + "/" + eval_phase + "_dec_finish." 
+ str(gpu_id), "w") + tmp_writer.close() + + +def concat_result(output_path, eval_phase, dev_count, RawResult): + outfile = output_path + "/" + eval_phase + all_results_read = [] + while True: + _, ret = subprocess.getstatusoutput('find ' + output_path + \ + ' -maxdepth 1 -name ' + eval_phase + '"_dec_finish.*"') + ret = ret.split("\n") + if len(ret) != dev_count: + time.sleep(1) + continue + + for dev_cnt in range(dev_count): + fin_read = open(outfile + ".part" + str(dev_cnt), "rb") + cur_rawresult = json.loads(fin_read.read()) + for tp in cur_rawresult: + assert len(tp) == 3 + all_results_read.append( + RawResult( + unique_id=tp[0], + start_logits=tp[1], + end_logits=tp[2])) + + subprocess.getstatusoutput("rm " + outfile + ".*part*") + subprocess.getstatusoutput("rm " + output_path + "/" + eval_phase + "_dec_finish.*") + break + + return all_results_read + + +def evaluate(exe, + test_program, + test_pyreader, + graph_vars, + eval_phase, + tag_num=None, + examples=None, + features=None, + args=None, + use_multi_gpu_test=False, + gpu_id=0, + dev_count=1, + output_path="./tmpout", + tokenizer=None, + version_2_with_negative=False): + if eval_phase == "train": + train_fetch_list = [graph_vars["loss"].name] + if "learning_rate" in graph_vars: + train_fetch_list.append(graph_vars["learning_rate"].name) + outputs = exe.run(fetch_list=train_fetch_list) + ret = {"loss": np.mean(outputs[0])} + if "learning_rate" in graph_vars: + ret["learning_rate"] = float(outputs[1][0]) + return ret + + output_dir = output_path + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_prediction_file = os.path.join(output_dir, eval_phase + "_predictions.json") + output_nbest_file = os.path.join(output_dir, eval_phase + "_nbest_predictions.json") + if version_2_with_negative: + output_null_log_odds_file = os.path.join(output_dir, eval_phase + "_null_odds.json") + else: + output_null_log_odds_file = None + + RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + test_pyreader.start() + all_results = [] + time_begin = time.time() + + fetch_list = [ + graph_vars["unique_id"].name, graph_vars["start_logits"].name, + graph_vars["end_logits"].name, graph_vars["num_seqs"].name + ] + while True: + try: + np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = exe.run( + program=test_program, fetch_list=fetch_list) + for idx in range(np_unique_ids.shape[0]): + if len(all_results) % 1000 == 0: + print("Processing example: %d" % len(all_results)) + unique_id = int(np_unique_ids[idx]) + start_logits = [float(x) for x in np_start_logits[idx].flat] + end_logits = [float(x) for x in np_end_logits[idx].flat] + all_results.append( + RawResult( + unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + + except fluid.core.EOFException: + test_pyreader.reset() + break + + is_print = True + if dev_count > 1: + is_print = False + write_result(output_dir, eval_phase, gpu_id, all_results) + if gpu_id == 0: + is_print = True + all_results = concat_result(output_dir, eval_phase, dev_count, RawResult) + + if is_print: + write_predictions(examples, features, all_results, + args.n_best_size, args.max_answer_length, + args.do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, tokenizer, version_2_with_negative) + + if eval_phase.find("dev") != -1: + data_file = args.dev_set + elif eval_phase.find("test") != -1: + data_file = args.test_set + + if version_2_with_negative: + from utils.evaluate_v2 import eval_file + eval_out = 
eval_file(data_file, output_prediction_file, output_null_log_odds_file) + print(eval_out) + + em, f1 = eval_out["exact"], eval_out["f1"] + print("em: %f, f1: %f, best f1: %f" + % (em, f1, eval_out["best_f1"])) + + write_predictions(examples, features, all_results, + args.n_best_size, args.max_answer_length, + args.do_lower_case, output_prediction_file+"_1", + output_nbest_file+"_1", output_null_log_odds_file+"_1", tokenizer, version_2_with_negative, null_score_diff_threshold=eval_out['best_f1_thresh']) + eval_out = eval_file(data_file, output_prediction_file+"_1", output_null_log_odds_file+"_1") + print(eval_out) + em, f1 = eval_out["exact"], eval_out["f1"] + subprocess.getstatusoutput("rm " + output_dir + "/*") + else: + from utils.evaluate_v1 import eval_file + em, f1 = eval_file(data_file, output_prediction_file) + + time_end = time.time() + elapsed_time = time_end - time_begin + + print("[%s evaluation] em: %f, f1: %f, elapsed time: %f" + % (eval_phase, em, f1, elapsed_time)) + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, tokenizer, version_2_with_negative=True, + null_score_diff_threshold=0.0): + + """Write final predictions to the json file and log-odds of null if needed.""" + print("Writing predictions to: %s" % (output_prediction_file)) + print("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", [ + "feature_index", "start_index", "end_index", "start_logit", + "end_logit" + ]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + if version_2_with_negative: + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[ + 0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + + + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1 + )] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + + 1)] + #tok_text = tokenizer.encoder.decode(map(int, tok_tokens)) + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + #tok_text = post_process(tok_text, tok_tokens, tokenizer) + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + # if we didn't inlude the empty option in the n-best, inlcude it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) + + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append( + _NbestPrediction( + text="empty", start_logit=0.0, end_logit=0.0)) + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry and version_2_with_negative: + if entry.text: + best_non_null_entry = entry + # debug + if best_non_null_entry is None and version_2_with_negative: + print("Emmm..., sth wrong") + + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + try: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + except: + scores_diff_json[example.qas_id] = 0 + all_predictions[example.qas_id] = "" + + + + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + +def post_process(text, tok_tokens, tokenizer): + prunc_pair = [[u"[", u"]"], [u"(", u")"], [u"{", u"}"]] + prunc_pair_dic = {u"[":u"]", u"(":u")", u"{":u"}", u'"':'"', u"'":"'"} + prunc_pair_flat = sum(prunc_pair, []) + + prunc = [u".", u",", u"%", u"-", u"!", u"?", u"~", u":", u";",u'"', u"'", u"#", u"$", + u"&", u"*", u"/", u"<", u">", u"=", u"\\", u"+", u"_", u"^", u"|"] + prunc_pair_flat + last_text = tokenizer.encoder.decode(map(int, [tok_tokens[-1]])) + _last_text = tokenizer.encoder.decode(map(int, tok_tokens[:-1])) + final_text = [] + start = -1 + for i,c in enumerate(last_text): + if c in prunc and start == -1: + start = i + else: + final_text.append(c) + + if c in prunc_pair_dic.keys() and prunc_pair_dic[c] in _last_text: + final_text.append(c) + elif c in prunc and i==start: + final_text.append(c) + return _last_text + "".join(final_text) + + +def get_final_text(pred_text, orig_text, do_lower_case): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". 
+ # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heruistic between + # `pred_text` and `orig_text` to get a character-to-charcter alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + #tok_text = orig_text + + start_position = tok_text.find(pred_text) + if start_position == -1: + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted( + enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs diff --git a/ernie-gram/finetune/sequence_label.py b/ernie-gram/finetune/sequence_label.py new file mode 100644 index 0000000..0801014 --- /dev/null +++ b/ernie-gram/finetune/sequence_label.py @@ -0,0 +1,249 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +import numpy as np +import paddle +import paddle.fluid as fluid + +from six.moves import xrange + +from model.ernie import ErnieModel + + +def create_model(args, pyreader_name, ernie_config, is_prediction=False): + pyreader = fluid.layers.py_reader( + capacity=50, + shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]], + dtypes=['int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'], + lod_levels=[0, 0, 0, 0, 0, 0, 0], + name=pyreader_name, + use_double_buffer=True) + + (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, + seq_lens) = fluid.layers.read_file(pyreader) + + ernie = ErnieModel( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + task_ids=task_ids, + input_mask=input_mask, + config=ernie_config, + use_fp16=args.use_fp16) + + enc_out = ernie.get_sequence_output() + logits = fluid.layers.fc( + input=enc_out, + size=args.num_labels, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name="cls_seq_label_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_seq_label_out_b", + initializer=fluid.initializer.Constant(0.))) + + ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1]) + ret_infers = fluid.layers.reshape( + x=fluid.layers.argmax( + logits, axis=2), shape=[-1, 1]) + + labels = fluid.layers.flatten(labels, axis=2) + ce_loss, probs = fluid.layers.softmax_with_cross_entropy( + logits=fluid.layers.flatten( + logits, axis=2), + label=labels, + return_softmax=True) + loss = fluid.layers.mean(x=ce_loss) + + graph_vars = { + "loss": loss, + "probs": probs, + "labels": ret_labels, + "infers": ret_infers, + "seq_lens": seq_lens + } + + for k, v in graph_vars.items(): + v.persistable = True + + return pyreader, graph_vars + + +def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1): + def extract_bio_chunk(seq): + chunks = [] + cur_chunk = None + null_index = tag_num - 1 + for index in xrange(len(seq)): + tag = seq[index] + tag_type = tag // 2 + tag_pos = tag % 2 + + if tag == null_index: + if cur_chunk is not None: + chunks.append(cur_chunk) + cur_chunk = None + continue + + if tag_pos == 0: + if cur_chunk is not None: + chunks.append(cur_chunk) + cur_chunk = {} + cur_chunk = {"st": index, "en": index + 1, "type": tag_type} + + else: + if cur_chunk is None: + cur_chunk = {"st": index, "en": index + 1, "type": tag_type} + continue + + if cur_chunk["type"] == tag_type: + cur_chunk["en"] = index + 1 + else: + chunks.append(cur_chunk) + cur_chunk = {"st": index, "en": index + 1, "type": tag_type} + + if cur_chunk is not None: + chunks.append(cur_chunk) + return chunks + + null_index = tag_num - 1 + num_label = 0 + num_infer = 0 + num_correct = 0 + labels = 
np_labels.reshape([-1]).astype(np.int32).tolist() + infers = np_infers.reshape([-1]).astype(np.int32).tolist() + all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist() + + base_index = 0 + for dev_index in xrange(dev_count): + lens = all_lens[dev_index] + max_len = 0 + for l in lens: + max_len = max(max_len, l) + + for i in xrange(len(lens)): + seq_st = base_index + i * max_len + 1 + seq_en = seq_st + (lens[i] - 2) + infer_chunks = extract_bio_chunk(infers[seq_st:seq_en]) + label_chunks = extract_bio_chunk(labels[seq_st:seq_en]) + num_infer += len(infer_chunks) + num_label += len(label_chunks) + + infer_index = 0 + label_index = 0 + while label_index < len(label_chunks) \ + and infer_index < len(infer_chunks): + if infer_chunks[infer_index]["st"] \ + < label_chunks[label_index]["st"]: + infer_index += 1 + elif infer_chunks[infer_index]["st"] \ + > label_chunks[label_index]["st"]: + label_index += 1 + else: + if infer_chunks[infer_index]["en"] \ + == label_chunks[label_index]["en"] \ + and infer_chunks[infer_index]["type"] \ + == label_chunks[label_index]["type"]: + num_correct += 1 + + infer_index += 1 + label_index += 1 + + base_index += max_len * len(lens) + + return num_label, num_infer, num_correct + + +def calculate_f1(num_label, num_infer, num_correct): + if num_infer == 0: + precision = 0.0 + else: + precision = num_correct * 1.0 / num_infer + + if num_label == 0: + recall = 0.0 + else: + recall = num_correct * 1.0 / num_label + + if num_correct == 0: + f1 = 0.0 + else: + f1 = 2 * precision * recall / (precision + recall) + return precision, recall, f1 + + +def evaluate(exe, + program, + pyreader, + graph_vars, + tag_num, + eval_phase, + dev_count=1): + fetch_list = [ + graph_vars["labels"].name, graph_vars["infers"].name, + graph_vars["seq_lens"].name + ] + + if eval_phase == "train": + fetch_list.append(graph_vars["loss"].name) + if "learning_rate" in graph_vars: + fetch_list.append(graph_vars["learning_rate"].name) + outputs = exe.run(fetch_list=fetch_list) + np_labels, np_infers, np_lens, np_loss = outputs[:4] + num_label, num_infer, num_correct = chunk_eval( + np_labels, np_infers, np_lens, tag_num, dev_count) + precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct) + outputs = { + "precision": precision, + "recall": recall, + "f1": f1, + "loss": np.mean(np_loss) + } + if "learning_rate" in graph_vars: + outputs["learning_rate"] = float(outputs[4][0]) + return outputs + + else: + total_label, total_infer, total_correct = 0.0, 0.0, 0.0 + time_begin = time.time() + pyreader.start() + while True: + try: + np_labels, np_infers, np_lens = exe.run(program=program, + fetch_list=fetch_list) + label_num, infer_num, correct_num = chunk_eval( + np_labels, np_infers, np_lens, tag_num, dev_count) + total_infer += infer_num + total_label += label_num + total_correct += correct_num + + except fluid.core.EOFException: + pyreader.reset() + break + + precision, recall, f1 = calculate_f1(total_label, total_infer, + total_correct) + time_end = time.time() + + print( + "[%s evaluation] f1: %f, precision: %f, recall: %f, elapsed time: %f s" + % (eval_phase, f1, precision, recall, time_end - time_begin)) diff --git a/ernie-gram/lanch.py b/ernie-gram/lanch.py new file mode 100644 index 0000000..3ca2f93 --- /dev/null +++ b/ernie-gram/lanch.py @@ -0,0 +1,143 @@ +import sys +import subprocess +import os +import six +import copy +import argparse +import time +import random + +from utils.args import ArgumentGroup, print_arguments, inv_arguments +from finetune.finetune_args 
import parser as finetuning_parser + + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +multip_g = ArgumentGroup(parser, "multiprocessing", + "start paddle training using multi-processing mode.") +multip_g.add_arg("node_ips", str, None, + "paddle trainer ips") +multip_g.add_arg("node_id", int, None, + "the trainer id of the node for multi-node distributed training.") +multip_g.add_arg("print_config", bool, True, + "print the config of multi-processing mode.") +multip_g.add_arg("current_node_ip", str, None, + "the ip of current node.") +multip_g.add_arg("split_log_path", str, "log", + "log path for each trainer.") +multip_g.add_arg("log_prefix", str, "", + "the prefix name of job log.") +multip_g.add_arg("nproc_per_node", int, 8, + "the number of process to use on each node.") +multip_g.add_arg("training_script", str, None, "the program/script to be lauched " + "in parallel followed by all the arguments", positional_arg=True) +multip_g.add_arg("training_script_args", str, None, + "training script args", positional_arg=True, nargs=argparse.REMAINDER) +grid_g = ArgumentGroup(parser, "grid_search", + "finetuning by grid searching.") +grid_g.add_arg("grid_lr", str, "1e-4", "learning rate.") +grid_g.add_arg("grid_bsz", str, "32", "barch size.") +grid_g.add_arg("grid_epoch", str, "3", "epoch.") + + +def start_procs(args, grid_search_config): + procs = [] + log_fns = [] + + default_env = os.environ.copy() + + node_id = args.node_id + node_ips = [x.strip() for x in args.node_ips.split(',')] + current_ip = args.current_node_ip + num_nodes = len(node_ips) + selected_gpus = list(map(str, range(args.nproc_per_node))) + selected_gpu_num = len(selected_gpus) + + all_trainer_endpoints = "" + for ip in node_ips: + for i in range(args.nproc_per_node): + if all_trainer_endpoints != "": + all_trainer_endpoints += "," + all_trainer_endpoints += "%s:617%d" % (ip, i) + + nranks = num_nodes * args.nproc_per_node + gpus_per_proc = args.nproc_per_node % selected_gpu_num + if gpus_per_proc == 0: + gpus_per_proc = selected_gpu_num / args.nproc_per_node + else: + gpus_per_proc = selected_gpu_num / args.nproc_per_node + 1 + + selected_gpus_per_proc = [selected_gpus[i:i + int(gpus_per_proc)] for i in range(0, len(selected_gpus), int(gpus_per_proc))] + + if args.print_config: + print("all_trainer_endpoints: ", all_trainer_endpoints, + ", node_id: ", node_id, + ", current_ip: ", current_ip, + ", num_nodes: ", num_nodes, + ", node_ips: ", node_ips, + ", gpus_per_proc: ", gpus_per_proc, + ", selected_gpus_per_proc: ", selected_gpus_per_proc, + ", nranks: ", nranks) + + current_env = copy.copy(default_env) + procs = [] + cmds = [] + log_fns = [] + for i in range(0, args.nproc_per_node): + trainer_id = node_id * args.nproc_per_node + i + current_env.update({ + "FLAGS_selected_gpus": "%s" % ",".join([str(s) for s in selected_gpus_per_proc[i]]), + "PADDLE_TRAINER_ID" : "%d" % trainer_id, + "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i), + "PADDLE_TRAINERS_NUM": "%d" % nranks, + "PADDLE_TRAINER_ENDPOINTS": all_trainer_endpoints, + "PADDLE_NODES_NUM": "%d" % num_nodes, + "GRID_SEARCH_LR": "%f" % grid_search_config["lr"], + "GRID_SEARCH_EPOCH": "%d" % grid_search_config["epoch"], + "GRID_SEARCH_BSZ": "%d" % grid_search_config["bsz"], + "RANDSEED": "%d" % grid_search_config["random_seed"] + }) + + cmd = [sys.executable, "-u", + args.training_script] + args.training_script_args + cmds.append(cmd) + + if args.split_log_path: + fn = open("%s/%sjob.log.%d" % (args.split_log_path, args.log_prefix, trainer_id), "a") 
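+            # each trainer appends its stdout/stderr to a per-trainer file, e.g.
+            # log/<bsz>.<epoch>.<lr>.job.log.<trainer_id> when launched through the
+            # grid search in main() below; the prefix comes from --log_prefix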
+ log_fns.append(fn) + process = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) + else: + process = subprocess.Popen(cmd, env=current_env) + procs.append(process) + + for i in range(len(procs)): + proc = procs[i] + proc.wait() + if len(log_fns) > 0: + log_fns[i].close() + if proc.returncode != 0: + raise subprocess.CalledProcessError(returncode=procs[i].returncode, + cmd=cmds[i]) + else: + print("proc %d finsh" % i) + + +def main(lanch_args): + if lanch_args.print_config: + print_arguments(lanch_args) + grid_lr = list(map(float, lanch_args.grid_lr.split(","))) + grid_bsz = list(map(int, lanch_args.grid_bsz.split(","))) + grid_epoch = list(map(int, lanch_args.grid_epoch.split(","))) + for bsz in grid_bsz: + for epoch in grid_epoch: + for lr in grid_lr: + lanch_args.log_prefix = ".".join([str(bsz), str(epoch), str(lr), ""]) + grid_search_config = {"bsz":bsz, "lr":lr, "epoch":epoch, "random_seed":random.randint(0, 10000)} + start_procs(lanch_args, grid_search_config) + + +if __name__ == "__main__": + lanch_args = parser.parse_args() + + while True: + main(lanch_args) diff --git a/ernie-gram/model/__init__.py b/ernie-gram/model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ernie-gram/model/ernie.py b/ernie-gram/model/ernie.py new file mode 100644 index 0000000..8d73cf6 --- /dev/null +++ b/ernie-gram/model/ernie.py @@ -0,0 +1,210 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
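+# Illustrative sketch of the JSON file consumed by ErnieConfig below; the key
+# names mirror what ErnieModel.__init__ reads, while the values here are only
+# placeholders rather than the released configuration:
+#   {"hidden_size": 768, "num_hidden_layers": 12, "num_attention_heads": 12,
+#    "vocab_size": 30522, "max_position_embeddings": 512,
+#    "sent_type_vocab_size": 4, "task_type_vocab_size": 3, "hidden_act": "gelu",
+#    "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1,
+#    "initializer_range": 0.02}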
+"""Ernie model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json + +import six +import paddle.fluid as fluid + +from model.transformer_encoder import rel_pos_encoder, pre_process_layer + + +class ErnieConfig(object): + def __init__(self, config_path): + self._config_dict = self._parse(config_path) + + def _parse(self, config_path): + try: + with open(config_path) as json_file: + config_dict = json.load(json_file) + except Exception: + raise IOError("Error in parsing Ernie model config file '%s'" % + config_path) + else: + return config_dict + + def __getitem__(self, key): + return self._config_dict.get(key) + + def print_config(self): + print('------- Model Arguments ---------') + for arg, value in sorted(six.iteritems(self._config_dict)): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +class ErnieModel(object): + def __init__(self, + src_ids, + position_ids, + sentence_ids, + task_ids, + input_mask, + config, + rel_pos_bin=32, + weight_sharing=True, + use_fp16=False, + has_sent_emb=False, + name=""): + + self._hidden_size = config['hidden_size'] + self._emb_size = config['emb_size'] or self._hidden_size + self._out_emb_size = config['out_emb_size'] or self._emb_size + self._voc_size = config['vocab_size'] + self._rel_pos_bin = rel_pos_bin + self._out_voc_size = config['out_vocab_size'] or self._voc_size + self._n_layer = config['num_hidden_layers'] + self._n_head = config['num_attention_heads'] + self._max_position_seq_len = config['max_position_embeddings'] + self._sent_types = config['sent_type_vocab_size'] + self._task_types = config['task_type_vocab_size'] + self._hidden_act = config['hidden_act'] + self._prepostprocess_dropout = config['hidden_dropout_prob'] + self._attention_dropout = config['attention_probs_dropout_prob'] + self._weight_sharing = weight_sharing + self.has_sent_emb = has_sent_emb + self._model_name = name + self._rel_pos_emb_name = self._model_name + "rel_pos_embedding" + self._word_emb_name = self._model_name + "word_embedding" + self._pos_emb_name = self._model_name + "pos_embedding" + self._sent_emb_name = self._model_name + "sent_embedding" + self._checkpoints = [] + self._input_mask = input_mask + self._emb_dtype = "float32" + + # Initialize all weigths by truncated normal initializer, and all biases + # will be initialized by constant zero by default. 
+ self._param_initializer = fluid.initializer.TruncatedNormal( + scale=config['initializer_range']) + + self._build_model(src_ids, position_ids, sentence_ids, task_ids, input_mask) + + def _build_model(self, src_ids, position_ids, sentence_ids, task_ids, input_mask): + # padding id in vocabulary must be set to 0 + emb_out = fluid.layers.embedding( + input=src_ids, + size=[self._voc_size, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._word_emb_name, initializer=self._param_initializer), + is_sparse=False) + + position_emb_out = fluid.layers.embedding( + input=position_ids[0], + size=[self._max_position_seq_len, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._pos_emb_name, initializer=self._param_initializer)) + + rel_position_scaler_emb_out = fluid.layers.embedding( + input=position_ids[1], + size=[self._rel_pos_bin + 1, self._n_head], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._rel_pos_emb_name, initializer=self._param_initializer)) + + sent_emb_out = fluid.layers.embedding( + sentence_ids, + size=[self._sent_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._sent_emb_name, initializer=self._param_initializer)) + + emb_out = emb_out + position_emb_out + if self.has_sent_emb: + emb_out = emb_out + sent_emb_out + + emb_out = pre_process_layer( + emb_out, 'nd', self._prepostprocess_dropout, name=self._model_name + 'pre_encoder') + + if self._emb_size != self._hidden_size: + emb_out = fluid.layers.fc(input=emb_out, + num_flatten_dims=2, + size=self._hidden_size, + param_attr=fluid.ParamAttr( + name=self._model_name + 'emb_hidden_mapping', + initializer=self._param_initializer), + bias_attr=self._model_name + 'emb_hidden_mapping_bias') + + + self_attn_mask = fluid.layers.matmul( + x=input_mask, y=input_mask, transpose_y=True) + + self_attn_mask = fluid.layers.scale( + x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) + n_head_self_attn_mask = fluid.layers.stack( + x=[self_attn_mask] * self._n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + + self._enc_out, encoder_checkpoints = rel_pos_encoder( + enc_input=emb_out, + pos_input=rel_position_scaler_emb_out, + attn_bias=n_head_self_attn_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._hidden_size // self._n_head, + d_value=self._hidden_size // self._n_head, + d_model=self._hidden_size, + d_inner_hid=self._hidden_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd="", + postprocess_cmd="dan", + param_initializer=self._param_initializer, + name=self._model_name + 'encoder') + + self._checkpoints.extend(encoder_checkpoints) + + def get_sequence_output(self): + _enc_out = fluid.layers.fc( + input=self._enc_out, + size=128, + num_flatten_dims=2, + act=self._hidden_act, + param_attr=fluid.ParamAttr( + name=self._model_name + 'mask_lm_trans_fc.w_0', + initializer=self._param_initializer), + bias_attr=fluid.ParamAttr(name=self._model_name + 'mask_lm_trans_fc.b_0')) + + return _enc_out + + def get_checkpoints(self): + """return checkpoints for recomputing""" + #recompute checkpoints + return self._checkpoints + + def get_pooled_output(self, has_fc=True): + """Get the first feature of each sequence for classification""" + next_sent_feat = fluid.layers.slice( + input=self._enc_out, axes=[1], starts=[0], ends=[1]) + if has_fc: + next_sent_feat = 
fluid.layers.fc( + input=next_sent_feat, + size=self._hidden_size, + act="tanh", + param_attr=fluid.ParamAttr( + name=self._model_name + "pooled_fc.w_0", initializer=self._param_initializer), + bias_attr=self._model_name + "pooled_fc.b_0") + else: + next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, self._hidden_size]) + + return next_sent_feat + diff --git a/ernie-gram/model/optimization.py b/ernie-gram/model/optimization.py new file mode 100644 index 0000000..ac4b827 --- /dev/null +++ b/ernie-gram/model/optimization.py @@ -0,0 +1,171 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Optimization and learning rate scheduling.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle.fluid import framework +from paddle.fluid.framework import Variable, default_main_program +from paddle.optimizer.lr import LambdaDecay + + +def get_warmup_and_linear_decay(max_steps, warmup_steps): + return lambda step: min(step / warmup_steps, 1. - (step - warmup_steps) / (max_steps - warmup_steps)) if warmup_steps else 1. + + +class AdamW(paddle.optimizer.AdamW): + """AdamW object for dygraph""" + def __init__(self, *args, **kwargs): + layerwise_lr_decay = kwargs.pop('layerwise_lr_decay_rate', 0.8) + n_layers = kwargs.pop('n_layers', 12) + super(AdamW, self).__init__(*args, **kwargs) + self.ld = layerwise_lr_decay + self.n_layers = n_layers + + def _get_layerwise_lr_decay_rate(self, param): + if param.name.startswith("encoder_layer"): + layer = int(param.name.split("_")[2]) + decay_rate = self.ld ** (self.n_layers - layer) + elif "embedding" in param.name: + decay_rate = self.ld ** (self.n_layers + 1) + else: + decay_rate = 1.0 + return decay_rate + + def _create_param_lr(self, param_and_grad): + # create learning rate tensor for every parameter + param = param_and_grad[0] + param_lr = param.optimize_attr['learning_rate'] * self._get_layerwise_lr_decay_rate(param) + if type(param_lr) == Variable: + return param_lr + else: + if param_lr == 1.0: + return self._global_learning_rate() + else: + with default_main_program()._lr_schedule_guard( + is_with_opt=True), framework.name_scope( + 'scale_with_param_lr'): + return self._global_learning_rate() * param_lr + + def _append_decoupled_weight_decay(self, block, param_and_grad): + """ + Add decoupled weight decay op. + parameter = parameter - parameter * coeff * lr + Args: + block: block in which variable is to be created + param_and_grad: (parameters, gradients) pairs, + the parameters need to decay. + Raises: + Exception: The type of coeff and parameter is not consistent. 
+ """ + param, grad = param_and_grad + + if self._apply_decay_param_fun is not None \ + and not self._apply_decay_param_fun(param.name): + return + + learning_rate = self._global_learning_rate() + + with block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + self._params_name.add(param.name) + + # If it has been calculated, the result will be reused. + # NOTE(wangxi): In dygraph mode, apply_gradient will be executed + # every step, so need clear _lr_to_coeff every step, + # we do this in _create_optimization_pass + decay_coeff = self._lr_to_coeff.get(learning_rate, None) + if decay_coeff is None: + decay_coeff = 1.0 - learning_rate * self._coeff + self._lr_to_coeff[learning_rate] = decay_coeff + + find_master = (self._multi_precision and + param.dtype == core.VarDesc.VarType.FP16) + if find_master: + master_weight = self._master_weights[param.name] + scaled_param = master_weight * decay_coeff + paddle.fluid.layers.assign( + input=scaled_param, output=master_weight) + else: + scaled_param = param * decay_coeff + paddle.fluid.layers.assign(input=scaled_param, output=param) + + +def optimization(loss, + warmup_steps, + num_train_steps, + learning_rate, + train_program, + startup_prog, + weight_decay, + scheduler='linear_warmup_decay', + dist_strategy=None, + use_amp=False, + init_loss_scaling=1.0, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=2, + incr_ratio=2.0, + decr_ratio=0.8, + layer_decay_rate=0.8, + n_layers=12): + + def exclude_from_weight_decay(param): + name = param.rstrip('.master') + if name.find("layer_norm") > -1: + return True + bias_suffix = ["_bias", "_b", ".b_0"] + for suffix in bias_suffix: + if name.endswith(suffix): + return True + return False + + grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + scheduled_lr = paddle.optimizer.lr.LambdaDecay( + learning_rate, + get_warmup_and_linear_decay(num_train_steps, warmup_steps)) + + optimizer = AdamW( + learning_rate=scheduled_lr, + beta1=0.9, + beta2=0.98, + epsilon=1e-06, + weight_decay=weight_decay, + apply_decay_param_fun=exclude_from_weight_decay, + grad_clip=grad_clip, + layerwise_lr_decay_rate=layer_decay_rate, + n_layers=n_layers) + + loss_scaling = fluid.layers.create_global_var( + name=fluid.unique_name.generate("loss_scaling"), + shape=[1], + value=1.0, + dtype='float32', + persistable=True) + + optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) + + _, param_grads = optimizer.minimize(loss) + + if use_amp: + loss_scaling = train_program.global_block().vars['loss_scaling_1'] + + return scheduled_lr, loss_scaling + + diff --git a/ernie-gram/model/transformer_encoder.py b/ernie-gram/model/transformer_encoder.py new file mode 100644 index 0000000..dde47e9 --- /dev/null +++ b/ernie-gram/model/transformer_encoder.py @@ -0,0 +1,349 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
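+# Note on the "process_cmd" strings used throughout this file: each character is
+# applied in order by pre_post_process_layer below -- "a" adds the residual
+# connection, "n" applies layer normalization and "d" applies dropout.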
+"""Transformer encoder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from functools import partial + +import paddle.fluid as fluid +import paddle.fluid.layers as layers + +def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + hidden_size = x.shape[-1] + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped = layers.reshape( + x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + +def multi_head_attention(queries, + keys, + values, + pos_bias, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + param_initializer=None, + name='multi_head_att'): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + keys = queries if keys is None else keys + values = keys if values is None else values + + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. + """ + q = layers.fc(input=queries, + size=d_key * n_head, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name=name + '_query_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_query_fc.b_0') + k = layers.fc(input=keys, + size=d_key * n_head, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name=name + '_key_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_key_fc.b_0') + v = layers.fc(input=values, + size=d_value * n_head, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name=name + '_value_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_value_fc.b_0') + return q, k, v + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. 
+ return layers.reshape( + x=trans_x, + shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], + inplace=True) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): + """ + Scaled Dot-Product Attention + """ + scaled_q = layers.scale(x=q, scale=d_key**-0.5) + product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + if attn_bias: + product += attn_bias + if pos_bias: + product += pos_bias + + weights = layers.softmax(product, use_cudnn=True) + if dropout_rate: + weights = layers.dropout( + weights, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + out = layers.matmul(weights, v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + + if cache is not None: # use cache and concat time steps + # Since the inplace reshape in __split_heads changes the shape of k and + # v, which is the cache input for next time step, reshape the cache + # input from the previous time step first. + k = cache["k"] = layers.concat( + [layers.reshape( + cache["k"], shape=[0, 0, d_model]), k], axis=1) + v = cache["v"] = layers.concat( + [layers.reshape( + cache["v"], shape=[0, 0, d_model]), v], axis=1) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name=name + '_output_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_output_fc.b_0') + return proj_out + + +def positionwise_feed_forward(x, + d_inner_hid, + d_hid, + dropout_rate, + hidden_act, + param_initializer=None, + name='ffn'): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=2, + act=hidden_act, + param_attr=fluid.ParamAttr( + name=name + '_fc_0.w_0', + initializer=param_initializer), + bias_attr=name + '_fc_0.b_0') + if dropout_rate: + hidden = layers.dropout( + hidden, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + out = layers.fc(input=hidden, + size=d_hid, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name=name + '_fc_1.w_0', initializer=param_initializer), + bias_attr=name + '_fc_1.b_0') + return out + + +def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., + name=''): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. 
+ """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_layer_norm_scale', + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + name=name + '_layer_norm_bias', + initializer=fluid.initializer.Constant(0.))) + elif cmd == "d": # add dropout + if dropout_rate: + out = layers.dropout( + out, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + return out + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + +def encoder_layer(enc_input, + pos_bias, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. + """ + attn_output = multi_head_attention( + pre_process_layer( + enc_input, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_att'), + None, + None, + pos_bias, + attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + param_initializer=param_initializer, + name=name + '_multi_head_att') + attn_output = post_process_layer( + enc_input, + attn_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_att') + ffd_output = positionwise_feed_forward( + pre_process_layer( + attn_output, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_ffn'), + d_inner_hid, + d_model, + relu_dropout, + hidden_act, + param_initializer=param_initializer, + name=name + '_ffn') + return post_process_layer( + attn_output, + ffd_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_ffn'), ffd_output + +def rel_pos_encoder(enc_input, + pos_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + reset=True, + name=''): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. 
+ """ + rel_pos_bias = layers.transpose(x=pos_input, perm=[0, 3, 1, 2]) + attn_bias += rel_pos_bias + pos_bias = None + + _checkpoints = [] + + for i in range(n_layer): + enc_output, cp = encoder_layer( + enc_input, + pos_bias, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd, + postprocess_cmd, + param_initializer=param_initializer, + name=name + '_layer_' + str(i)) + _checkpoints.append(cp.name) + enc_input = enc_output + enc_output = pre_process_layer( + enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") + + return enc_output, _checkpoints diff --git a/ernie-gram/reader/__init__.py b/ernie-gram/reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ernie-gram/reader/batching.py b/ernie-gram/reader/batching.py new file mode 100644 index 0000000..756f281 --- /dev/null +++ b/ernie-gram/reader/batching.py @@ -0,0 +1,248 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Mask, padding and batching.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import math +from six.moves import xrange + + +def mask(batch_tokens, + seg_labels, + mask_word_tags, + total_token_num, + vocab_size, + CLS=1, + SEP=2, + MASK=3): + """ + Add mask for batch_tokens, return out, mask_label, mask_pos; + Note: mask_pos responding the batch_tokens after padded; + """ + max_len = max([len(sent) for sent in batch_tokens]) + mask_label = [] + mask_pos = [] + prob_mask = np.random.rand(total_token_num) + # Note: the first token is [CLS], so [low=1] + replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) + pre_sent_len = 0 + prob_index = 0 + for sent_index, sent in enumerate(batch_tokens): + mask_flag = False + mask_word = mask_word_tags[sent_index] + prob_index += pre_sent_len + if mask_word: + beg = 0 + for token_index, token in enumerate(sent): + seg_label = seg_labels[sent_index][token_index] + if seg_label == 1: + continue + if beg == 0: + if seg_label != -1: + beg = token_index + continue + + prob = prob_mask[prob_index + beg] + if prob > 0.15: + pass + else: + for index in xrange(beg, token_index): + prob = prob_mask[prob_index + index] + base_prob = 1.0 + if index == beg: + base_prob = 0.15 + if base_prob * 0.2 < prob <= base_prob: + mask_label.append(sent[index]) + sent[index] = MASK + mask_flag = True + mask_pos.append(sent_index * max_len + index) + elif base_prob * 0.1 < prob <= base_prob * 0.2: + mask_label.append(sent[index]) + sent[index] = replace_ids[prob_index + index] + mask_flag = True + mask_pos.append(sent_index * max_len + index) + else: + mask_label.append(sent[index]) + mask_pos.append(sent_index * max_len + index) + + if seg_label == -1: + beg = 0 + else: + beg = token_index + else: + for token_index, token in enumerate(sent): + prob = prob_mask[prob_index + token_index] 
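+                # char-level masking: prob > 0.15 leaves the token untouched; of the
+                # selected ~15%, 80% become [MASK] (0.03 < prob <= 0.15), 10% are
+                # replaced by a random token (0.015 < prob <= 0.03) and 10% keep the
+                # original token, mirroring the BERT-style 80/10/10 split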
+ if prob > 0.15: + continue + elif 0.03 < prob <= 0.15: + # mask + if token != SEP and token != CLS: + mask_label.append(sent[token_index]) + sent[token_index] = MASK + mask_flag = True + mask_pos.append(sent_index * max_len + token_index) + elif 0.015 < prob <= 0.03: + # random replace + if token != SEP and token != CLS: + mask_label.append(sent[token_index]) + sent[token_index] = replace_ids[prob_index + + token_index] + mask_flag = True + mask_pos.append(sent_index * max_len + token_index) + else: + # keep the original token + if token != SEP and token != CLS: + mask_label.append(sent[token_index]) + mask_pos.append(sent_index * max_len + token_index) + + pre_sent_len = len(sent) + + mask_label = np.array(mask_label).astype("int64").reshape([-1, 1]) + mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1]) + return batch_tokens, mask_label, mask_pos + + +def _get_rel_pos_scaler(seq_len, max_len=128, num_buckets=32, bidirectional=True, reset=True): + #max_len = 520 + pos = np.array(range(seq_len)) + rel_pos = pos[:, None] - pos[None, :] + ret = 0 + n = -rel_pos + if bidirectional: + num_buckets //= 2 + ret += (n < 0).astype('int32') * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets + n = np.abs(n) + else: + n = np.max(n, np.zeros_like(n)) + # now n is in the range [0, inf) + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = n < max_exact + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + val_if_large = max_exact + (np.log(n.astype('float32') / max_exact) / math.log(max_len / max_exact) * (num_buckets - max_exact)).astype('int32') + tmp = np.full_like(val_if_large, num_buckets-1) + val_if_large = np.where(val_if_large < tmp, val_if_large, tmp) + + ret += np.where(is_small, n, val_if_large) + if reset: + num_buckets *= 2 + ret[:, 0] = num_buckets + ret[0, :] = num_buckets // 2 + + return np.array(ret).reshape([seq_len, seq_len, 1]).astype("int64") + + +def prepare_batch_data(insts, + total_token_num, + voc_size=0, + pad_id=None, + cls_id=None, + sep_id=None, + mask_id=None, + return_input_mask=True, + return_max_len=True, + return_num_token=False): + + batch_src_ids = [inst[0] for inst in insts] + batch_sent_ids = [inst[1] for inst in insts] + batch_pos_ids = [inst[2] for inst in insts] + labels = [inst[3] for inst in insts] + labels = np.array(labels).astype("int64").reshape([-1, 1]) + seg_labels = [inst[4] for inst in insts] + mask_word_tags = [inst[5] for inst in insts] + + # First step: do mask without padding + assert mask_id >= 0, "[FATAL] mask_id must >= 0" + out, mask_label, mask_pos = mask( + batch_src_ids, + seg_labels, + mask_word_tags, + total_token_num, + vocab_size=voc_size, + CLS=cls_id, + SEP=sep_id, + MASK=mask_id) + + # Second step: padding + src_id, self_input_mask = pad_batch_data( + out, pad_idx=pad_id, return_input_mask=True) + pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id) + sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id) + + return_list = [ + src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos, labels + ] + + return return_list + + +def pad_batch_data(insts, + pad_idx=0, + return_pos=False, + return_input_mask=False, + return_max_len=False, + return_num_token=False, + return_seq_lens=False): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. 
+ """ + return_list = [] + max_len = max(len(inst) for inst in insts) + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. + + inst_data = np.array( + [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] + + # position data + if return_pos: + inst_pos = np.array([ + list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) + for inst in insts + ]) + + return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] + + if return_input_mask: + # This is used to avoid attention on paddings. + input_mask_data = np.array([[1] * len(inst) + [0] * + (max_len - len(inst)) for inst in insts]) + input_mask_data = np.expand_dims(input_mask_data, axis=-1) + return_list += [input_mask_data.astype("float32")] + + if return_max_len: + return_list += [max_len] + + if return_num_token: + num_token = 0 + for inst in insts: + num_token += len(inst) + return_list += [num_token] + + if return_seq_lens: + seq_lens = np.array([len(inst) for inst in insts]) + return_list += [seq_lens.astype("int64").reshape([-1, 1])] + + return return_list if len(return_list) > 1 else return_list[0] + + +if __name__ == "__main__": + pass diff --git a/ernie-gram/reader/pretraining.py b/ernie-gram/reader/pretraining.py new file mode 100644 index 0000000..e1fe029 --- /dev/null +++ b/ernie-gram/reader/pretraining.py @@ -0,0 +1,389 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
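+# Each line of the (gzipped) pretraining files holds ";"-separated fields; with
+# input_slots == 5 a line looks like (token values below are placeholders only):
+#   "1 4960 7423 2;0 0 0 0;0 1 2 3;-1 0 0 -1;1"
+#   token_ids ; sent_ids ; pos_ids ; seg_labels ; label
+# With input_slots == 4 the seg_labels field is absent; see parse_line below.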
+ +from __future__ import print_function +from __future__ import division + +import os +import numpy as np +import types +import gzip +import logging +import re +import six +import collections +import tokenization + +import paddle +import paddle.fluid as fluid + +from batching import prepare_batch_data + + +class ErnieDataReader(object): + def __init__(self, + filelist, + vocab_path, + batch_size=4096, + in_tokens=True, + max_seq_len=512, + shuffle_files=True, + random_seed=1, + epoch=100, + voc_size=0, + is_test=False, + generate_neg_sample=False, + hack_old_trainset=False): + + self.vocab = self.load_vocab(vocab_path) + self.filelist = filelist + self.batch_size = batch_size + self.in_tokens = in_tokens + self.random_seed = random_seed + self.shuffle_files = shuffle_files + self.epoch = epoch + self.current_epoch = 0 + self.current_file_index = 0 + self.total_file = 0 + self.current_file = None + self.voc_size = voc_size + self.max_seq_len = max_seq_len + self.pad_id = self.vocab["[PAD]"] + self.cls_id = self.vocab["[CLS]"] + self.sep_id = self.vocab["[SEP]"] + self.mask_id = self.vocab["[MASK]"] + self.input_slots = 5 + self.is_test = is_test + self.generate_neg_sample = generate_neg_sample + + self.trainer_id = 0 + self.trainer_nums = 1 + self.files = open(filelist).readlines() + self.total_file = len(self.files) + + if self.is_test: + self.epoch = 1 + self.shuffle_files = False + + self.global_rng = np.random.RandomState(random_seed) + if self.shuffle_files: + if os.getenv("PADDLE_TRAINER_ID"): + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + if os.getenv("PADDLE_NODES_NUM"): + self.trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM")) + #renew total_file + self.total_file = len(self.files) // self.trainer_nums * self.trainer_nums + + tmp_files = [] + for each in range(epoch): + each_files = [i for i in self.files] + self.global_rng.shuffle(each_files) + tmp_files += each_files + self.files = tmp_files + #renew epochs + self.epoch = len(self.files) // self.total_file * self.total_file + + assert self.total_file > 0, \ + "[Error] data_dir is empty or less than %d" % self.trainer_nums + + if self.in_tokens: + assert self.batch_size > 100, "Current batch size means total token's number, \ + it should not be set to too small number." + + if hack_old_trainset: + self.input_slots = 4 + + def get_progress(self): + """return current progress of traning data + """ + return self.current_epoch, self.current_file_index, self.total_file, self.current_file, self.mask_type + + def parse_line(self, line, max_seq_len=512): + """ parse one line to token_ids, sentence_ids, pos_ids, label + """ + line = line.strip().split(";") + assert len(line) == self.input_slots, \ + "One sample must have %d fields!" 
% self.input_slots + + if self.input_slots == 4: + (token_ids, sent_ids, pos_ids, label) = line + token_ids = [int(token) for token in token_ids.split(" ")] + sent_ids = [int(token) for token in sent_ids.split(" ")] + pos_ids = [int(token) for token in pos_ids.split(" ")] + #fake seg_labels + seg_labels = [0, ] * len(sent_ids) + id_sent_b = sent_ids[0] + 1 + len_sent_a = sent_ids.index(id_sent_b) + #sent_a, sent_b + seg_labels[0] = seg_labels[len_sent_a - 1] = seg_labels[-1] = -1 + + if self.input_slots == 5: + (token_ids, sent_ids, pos_ids, seg_labels, label) = line + token_ids = [int(token) for token in token_ids.split(" ")] + sent_ids = [int(token) for token in sent_ids.split(" ")] + pos_ids = [int(token) for token in pos_ids.split(" ")] + seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")] + + assert len(token_ids) == len(sent_ids) == len(pos_ids) == len( + seg_labels + ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)" + label = int(label) + if len(token_ids) > max_seq_len: + return None + return [token_ids, sent_ids, pos_ids, label, seg_labels] + + def read_file(self, file): + assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file + with gzip.open(file, "rb") as f: + for line in f: + parsed_line = self.parse_line( + line, max_seq_len=self.max_seq_len) + if parsed_line is None: + continue + yield parsed_line + + def convert_to_unicode(self, text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + def load_vocab(self, vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + fin = open(vocab_file) + for num, line in enumerate(fin): + items = self.convert_to_unicode(line.strip()).split("\t") + if len(items) > 2: + break + token = items[0] + index = items[1] if len(items) == 2 else num + token = token.strip() + vocab[token] = int(index) + return vocab + + def random_pair_neg_samples(self, pos_samples): + """ randomly generate negtive samples using pos_samples + + Args: + pos_samples: list of positive samples + + Returns: + neg_samples: list of negtive samples + """ + np.random.shuffle(pos_samples) + num_sample = len(pos_samples) + neg_samples = [] + miss_num = 0 + + def split_sent(sample, max_len, sep_id): + token_seq, type_seq, pos_seq, label, seg_labels = sample + sep_index = token_seq.index(sep_id) + left_len = sep_index - 1 + if left_len <= max_len: + return (token_seq[1:sep_index], seg_labels[1:sep_index]) + else: + return [ + token_seq[sep_index + 1:-1], seg_labels[sep_index + 1:-1] + ] + + for i in range(num_sample): + pair_index = (i + 1) % num_sample + left_tokens, left_seg_labels = split_sent( + pos_samples[i], (self.max_seq_len - 3) // 2, self.sep_id) + right_tokens, right_seg_labels = split_sent( + pos_samples[pair_index], + self.max_seq_len - 3 - len(left_tokens), self.sep_id) + + token_seq = [self.cls_id] + left_tokens + [self.sep_id] + \ + right_tokens + [self.sep_id] + if len(token_seq) > self.max_seq_len: + miss_num += 1 + continue + type_seq = [0] * (len(left_tokens) 
+ 2) + [1] * (len(right_tokens) + + 1) + pos_seq = range(len(token_seq)) + seg_label_seq = [-1] + left_seg_labels + [-1] + right_seg_labels + [ + -1 + ] + + assert len(token_seq) == len(type_seq) == len(pos_seq) == len(seg_label_seq), \ + "[ERROR]len(src_id) == lne(sent_id) == len(pos_id) must be True" + neg_samples.append([token_seq, type_seq, pos_seq, 0, seg_label_seq]) + + return neg_samples, miss_num + + def mixin_negtive_samples(self, pos_sample_generator, buffer=1000): + """ 1. generate negtive samples by randomly group sentence_1 and sentence_2 of positive samples + 2. combine negtive samples and positive samples + + Args: + pos_sample_generator: a generator producing a parsed positive sample, which is a list: [token_ids, sent_ids, pos_ids, 1] + + Returns: + sample: one sample from shuffled positive samples and negtive samples + """ + pos_samples = [] + num_total_miss = 0 + pos_sample_num = 0 + try: + while True: + while len(pos_samples) < buffer: + pos_sample = next(pos_sample_generator) + label = pos_sample[3] + assert label == 1, "positive sample's label must be 1" + pos_samples.append(pos_sample) + pos_sample_num += 1 + + neg_samples, miss_num = self.random_pair_neg_samples( + pos_samples) + num_total_miss += miss_num + samples = pos_samples + neg_samples + pos_samples = [] + np.random.shuffle(samples) + for sample in samples: + yield sample + except StopIteration: + print("stopiteration: reach end of file") + if len(pos_samples) == 1: + yield pos_samples[0] + elif len(pos_samples) == 0: + yield None + else: + neg_samples, miss_num = self.random_pair_neg_samples( + pos_samples) + num_total_miss += miss_num + samples = pos_samples + neg_samples + pos_samples = [] + np.random.shuffle(samples) + for sample in samples: + yield sample + print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" % + (num_total_miss, pos_sample_num * 2, + num_total_miss / (pos_sample_num * 2))) + + def shuffle_samples(self, sample_generator, buffer=1000): + samples = [] + try: + while True: + while len(samples) < buffer: + sample = next(sample_generator) + samples.append(sample) + np.random.shuffle(samples) + for sample in samples: + yield sample + samples = [] + except StopIteration: + print("stopiteration: reach end of file") + if len(samples) == 0: + yield None + else: + np.random.shuffle(samples) + for sample in samples: + yield sample + + def data_generator(self): + """ + data_generator + """ + def wrapper(): + def reader(): + for epoch in range(self.epoch): + self.current_epoch = epoch + 1 + files = self.files + #during training, data are sliced by trainers + if self.shuffle_files: + start = epoch * self.total_file + end = start + self.total_file + files = [file_ for index, file_ in enumerate(self.files[start:end]) \ + if index % self.trainer_nums == self.trainer_id] + + for index, file_ in enumerate(files): + file_, mask_word_prob = file_.strip().split("\t") + mask_word = (np.random.random() < float(mask_word_prob)) + self.current_file_index = (index + 1) * self.trainer_nums + self.current_file = file_ + if mask_word: + self.mask_type = "mask_word" + else: + self.mask_type = "mask_char" + + sample_generator = self.read_file(file_) + if not self.is_test: + if self.generate_neg_sample: + sample_generator = self.mixin_negtive_samples( + sample_generator) + else: + #shuffle buffered sample + sample_generator = self.shuffle_samples( + sample_generator) + + for sample in sample_generator: + if sample is None: + continue + sample.append(mask_word) + yield sample + + def batch_reader(reader, 
batch_size): + batch, total_token_num, max_len = [], 0, 0 + for parsed_line in reader(): + token_ids, sent_ids, pos_ids, label, seg_labels, mask_word = parsed_line + max_len = max(max_len, len(token_ids)) + if self.in_tokens: + to_append = (len(batch) + 1) * max_len <= batch_size + else: + to_append = len(batch) < batch_size + if to_append: + batch.append(parsed_line) + total_token_num += len(token_ids) + else: + yield batch, total_token_num + batch, total_token_num, max_len = [parsed_line], len( + token_ids), len(token_ids) + + if len(batch) > 0: + yield batch, total_token_num + + for batch_data, total_token_num in batch_reader(reader, + self.batch_size): + yield prepare_batch_data( + batch_data, + total_token_num, + voc_size=self.voc_size, + pad_id=self.pad_id, + cls_id=self.cls_id, + sep_id=self.sep_id, + mask_id=self.mask_id, + return_input_mask=True, + return_max_len=False, + return_num_token=False) + + return wrapper + + +if __name__ == "__main__": + pass diff --git a/ernie-gram/reader/task_reader.py b/ernie-gram/reader/task_reader.py new file mode 100644 index 0000000..9f050ca --- /dev/null +++ b/ernie-gram/reader/task_reader.py @@ -0,0 +1,1028 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
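+# Task data files are headered TSVs: the first row names the fields of each
+# Example (for instance "qid", "text_a", "text_b", "label"; text_b and qid are
+# optional) and every following row is one example; see BaseReader._read_tsv.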
+ +import os +import six +import csv +import json +import numpy as np +from collections import namedtuple + +import reader.tokenization as tokenization +from reader.batching import pad_batch_data, _get_rel_pos_scaler + +class BaseReader(object): + def __init__(self, + vocab_path, + label_map_config=None, + max_seq_len=512, + do_lower_case=True, + in_tokens=False, + tokenizer="FullTokenizer", + is_classify=True, + is_regression=False, + eval_span=False): + self.max_seq_len = max_seq_len + self.tokenizer = getattr(tokenization, tokenizer)( + vocab_file=vocab_path, do_lower_case=do_lower_case) + self.vocab = self.tokenizer.vocab + self.pad_id = self.vocab["[PAD]"] + self.cls_id = self.vocab["[CLS]"] + self.sep_id = self.vocab["[SEP]"] + self.rel_pos = _get_rel_pos_scaler(512) + + self.in_tokens = in_tokens + self.is_classify = is_classify + self.is_regression = is_regression + self.eval_span = eval_span + + + self.random_seed = int(os.getenv("RANDSEED")) + print("reader", self.random_seed) + self.global_rng = np.random.RandomState(self.random_seed) + + self.trainer_id = 0 + self.trainer_nums = 1 + if os.getenv("PADDLE_TRAINER_ID"): + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + if os.getenv("PADDLE_NODES_NUM"): + self.trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM")) + + self.current_example = 0 + self.current_epoch = 0 + self.num_examples = 0 + + if label_map_config: + with open(label_map_config) as f: + self.label_map = json.load(f) + else: + self.label_map = None + + def get_train_progress(self): + """Gets progress for training phase.""" + return self.current_example, self.current_epoch + + def _read_tsv(self, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + headers = next(reader) + Example = namedtuple('Example', headers) + + examples = [] + for line in reader: + example = Example(*line) + examples.append(example) + return examples + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + def _convert_example_to_record(self, example, max_seq_length, tokenizer): + """Converts a single `Example` into a single `Record`.""" + + #text_a = tokenization.convert_to_unicode(example.text_a) + text_a = example.text_a.decode('utf8') if six.PY2 else example.text_a + tokens_a = tokenizer.tokenize(text_a) + if len(tokens_a) > 510: + tokens_b = tokens_a[-381:] + tokens_a = tokens_a[:128] + + tokens_b = None + has_text_b = False + if isinstance(example, dict): + has_text_b = "text_b" in example.keys() + else: + has_text_b = "text_b" in example._fields + + if has_text_b: + #text_b = tokenization.convert_to_unicode(example.text_b) + text_b = example.text_b.decode('utf8') if six.PY2 else example.text_b + tokens_b = tokenizer.tokenize(text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
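+            # For illustration: with max_seq_length=128 the pair is trimmed until
+            # len(tokens_a) + len(tokens_b) <= 125, popping one token at a time
+            # from whichever side is currently longer.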
+ # Account for [CLS], [SEP], [SEP] with "- 3" + self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + tokens = [] + text_type_ids = [] + tokens.append("[CLS]") + text_type_ids.append(0) + for token in tokens_a: + tokens.append(token) + text_type_ids.append(0) + tokens.append("[SEP]") + text_type_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + text_type_ids.append(1) + tokens.append("[SEP]") + text_type_ids.append(1) + + token_ids = tokenizer.convert_tokens_to_ids(tokens) + position_ids = list(range(len(token_ids))) + task_ids = [0] * len(token_ids) + + if self.label_map: + label_id = self.label_map[example.label] + else: + try: + label_id = example.labels + except: + label_id = example.label + + Record = namedtuple( + 'Record', + ['token_ids', 'text_type_ids', 'position_ids', 'task_ids', 'label_id', 'qid']) + + qid = None + if "qid" in example._fields: + qid = example.qid + + record = Record( + token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids, + task_ids=task_ids, + label_id=label_id, + qid=qid) + return record + + def _stride(self, text, max_len=510): + spans = [] + index = 0 + + if len(text) > max_len: + spans.append(text[:128] + text[-382:]) + spans.append(text[:max_len]) + spans.append(text[-max_len:]) + else: + spans.append(text) + + return spans + + def _convert_example_to_record_spans(self, example, max_seq_length, tokenizer, qid, max_len=512): + """Converts a single `Example` into a single `Record`.""" + if self.label_map: + label_id = self.label_map[example.label] + else: + label_id = example.label + records = [] + text_a = example.text_a.decode('utf8') if six.PY2 else example.text_a + tokens_a = tokenizer.tokenize(text_a) + spans = self._stride(tokens_a, max_len-2) + for span in spans: + tokens = [] + text_type_ids = [] + tokens.append("[CLS]") + text_type_ids.append(0) + for token in span: + tokens.append(token) + text_type_ids.append(0) + tokens.append("[SEP]") + text_type_ids.append(0) + + token_ids = tokenizer.convert_tokens_to_ids(tokens) + position_ids = list(range(len(token_ids))) + task_ids = [0] * len(token_ids) + + Record = namedtuple( + 'Record', + ['token_ids', 'text_type_ids', 'position_ids', 'task_ids', 'label_id', 'qid']) + + records.append(Record( + token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids, + task_ids=task_ids, + label_id=label_id, + qid=qid)) + return records + + def _prepare_batch_data(self, examples, batch_size, phase=None): + """generate batch records""" + batch_records, max_len = [], 0 + for index, example in enumerate(examples): + if phase == "train": + self.current_example = index + if not self.eval_span or phase == "train": + records = [self._convert_example_to_record(example, self.max_seq_len, + self.tokenizer)] + else: + records = self._convert_example_to_record_spans(example, self.max_seq_len, + self.tokenizer, index) + for record in records: + if isinstance(record.token_ids[0], list): + max_len = max(max_len, max(map(lambda x:len(x), record.token_ids))) + else: + max_len = max(max_len, len(record.token_ids)) + if self.in_tokens: + to_append = (len(batch_records) + 1) * max_len <= batch_size + else: + to_append = len(batch_records) < batch_size + if to_append: + batch_records.append(record) + else: + yield self._pad_batch_records(batch_records) + batch_records, max_len = [record], len(record.token_ids) 
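+                    # the record that did not fit starts the next batch and resets max_len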
+ + if batch_records: + yield self._pad_batch_records(batch_records) + + def get_num_examples(self, input_file): + examples = self._read_tsv(input_file) + return len(examples) + + def data_generator(self, + input_file, + batch_size, + epoch, + dev_count=1, + shuffle=True, + phase=None): + examples = self._read_tsv(input_file) + + def wrapper(): + all_dev_batches = [] + trainer_id = 0 + for epoch_index in range(epoch): + if phase == "train": + self.current_example = 0 + self.current_epoch = epoch_index + self.random_seed = epoch_index + self.global_rng = np.random.RandomState(self.random_seed) + trainer_id = self.trainer_id + else: + #dev/test + #assert dev_count == 1, "only supports 1 GPU prediction" + trainer_id = self.trainer_id + + current_examples = [ins for ins in examples] + if shuffle: + self.global_rng.shuffle(current_examples) + #if phase == "train" and self.trainer_nums > 1: + # current_examples = [ins for index, ins in enumerate(current_examples) + # if index % self.trainer_nums == self.trainer_id] + for batch_data in self._prepare_batch_data( + current_examples, batch_size, phase=phase): + if len(all_dev_batches) < dev_count: + all_dev_batches.append(batch_data) + if len(all_dev_batches) == dev_count: + #trick: handle batch inconsistency caused by data sharding for each trainer + yield all_dev_batches[trainer_id] + all_dev_batches = [] + if phase != "train": + if trainer_id < len(all_dev_batches): + yield all_dev_batches[trainer_id] + + return wrapper + + +class ClassifyReader(BaseReader): + def _read_tsv(self, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + headers = next(reader) + text_indices = [ + index for index, h in enumerate(headers) if h != "label" + ] + label_indices = [ + index for index, h in enumerate(headers) if h == "label" + ] + Example = namedtuple('Example', headers) + + examples = [] + for line in reader: + for index, text in enumerate(line): + if index in text_indices: + line[index] = text #.replace(' ', '') + + example = Example(*line) + examples.append(example) + return examples + + def _pad_batch_records(self, batch_records): + batch_token_ids = [record.token_ids for record in batch_records] + batch_text_type_ids = [record.text_type_ids for record in batch_records] + batch_position_ids = [record.position_ids for record in batch_records] + batch_task_ids = [record.task_ids for record in batch_records] + batch_labels = [record.label_id for record in batch_records] + if self.is_classify: + batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1]) + elif self.is_regression: + batch_labels = np.array(batch_labels).astype("float32").reshape([-1, 1]) + if batch_records[0].qid: + batch_qids = [record.qid for record in batch_records] + batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1]) + else: + batch_qids = np.array([]).astype("int64").reshape([-1, 1]) + + # padding + padded_token_ids, input_mask = pad_batch_data( + batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + padded_task_ids = pad_batch_data( + batch_task_ids, self.pad_id)#pad_idx=self.pad_id) + if padded_token_ids.shape[1] > 512: + rel_pos_scaler = _get_rel_pos_scaler(padded_token_ids.shape[1]) + else: + rel_pos_scaler = self.rel_pos[:padded_token_ids.shape[1], 
:padded_token_ids.shape[1], :] + rel_pos_scaler = np.array([rel_pos_scaler for i in range(padded_token_ids.shape[0])]).astype("int64") + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + padded_task_ids, input_mask, batch_labels, batch_qids, rel_pos_scaler + + ] + + return return_list + + +class ClassifyReaderRace(ClassifyReader): + def _convert_example_to_record_race(self, example, max_seq_length, tokenizer): + """Converts a single `Example` into a single `Record`.""" + + #text_a = tokenization.convert_to_unicode(example.text_a) + total_len = 0 + if six.PY3: + text_p = example.text_a + text_q = example.text_b + text_a = example.text_c.split("") + else: + text_p = example.text_a.decode('utf8') + text_q = example.text_b.decode('utf8') + text_a = example.text_c.decode('utf8').split("") + tokens_p = tokenizer.tokenize(text_p) + assert len(text_a) == 4 + tokens_all = [] + position_all = [] + seg_all = [] + task_all = [] + for i in range(4): + if "_" in text_q: + text_qa = text_q.replace("_", text_a[i]) + else: + text_qa = " ".join([text_q, text_a[i]]) + tokens_qa = tokenizer.tokenize(text_qa) + tokens_p = tokens_p[:max_seq_length - len(tokens_qa) - 3] + tokens = [] + text_type_ids = [] + tokens.append("[CLS]") + text_type_ids.append(0) + for token in tokens_qa: + tokens.append(token) + text_type_ids.append(0) + tokens.append("[SEP]") + text_type_ids.append(0) + + for token in tokens_p: + tokens.append(token) + text_type_ids.append(1) + tokens.append("[SEP]") + text_type_ids.append(1) + tokens_id = tokenizer.convert_tokens_to_ids(tokens) + tokens_all.append(tokens_id) + position_all.append(list(range(len(tokens_id)))) + task_all.append([0] * len(tokens_id)) + seg_all.append(text_type_ids) + + if self.label_map: + label_id = self.label_map[example.label] + else: + label_id = example.labels + + Record = namedtuple( + 'Record', + ['token_ids', 'text_type_ids', 'position_ids', 'task_ids', 'label_id', 'qid']) + + qid = None + if "qid" in example._fields: + qid = example.qid + + record = Record( + token_ids=tokens_all, + text_type_ids=seg_all, + position_ids=position_all, + task_ids=task_all, + label_id=label_id, + qid=qid) + return record + + def _prepare_batch_data(self, examples, batch_size, phase=None): + """generate batch records""" + batch_records, max_len = [], 0 + for index, example in enumerate(examples): + if phase == "train": + self.current_example = index + record = self._convert_example_to_record_race(example, self.max_seq_len, + self.tokenizer) + if isinstance(record.token_ids[0], list): + max_len = max(max_len, max(map(lambda x:len(x), record.token_ids))) + else: + max_len = max(max_len, len(record.token_ids)) + if self.in_tokens: + to_append = (len(batch_records) + 1) * max_len <= batch_size + else: + to_append = len(batch_records) < batch_size + if to_append: + batch_records.append(record) + else: + yield self._pad_batch_records(batch_records) + batch_records, max_len = [record], len(record.token_ids) + if batch_records: + yield self._pad_batch_records(batch_records) + + def _pad_batch_records(self, batch_records): + batch_token_ids = [record.token_ids for record in batch_records] + batch_text_type_ids = [record.text_type_ids for record in batch_records] + batch_position_ids = [record.position_ids for record in batch_records] + batch_task_ids = [record.task_ids for record in batch_records] + batch_labels = [record.label_id for record in batch_records] + label_all = [] + for l in batch_labels: + tmp = [0, 0, 0, 0] + tmp[int(l)] = 1 + 
label_all.extend(tmp) + batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1]) + batch_labels_all = np.array(label_all).astype("float32").reshape([-1, 1]) + + if batch_records[0].qid: + batch_qids = [record.qid for record in batch_records] + batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1]) + else: + batch_qids = np.array([]).astype("int64").reshape([-1, 1]) + + # padding + batch_token_ids = sum(batch_token_ids, []) + batch_text_type_ids = sum(batch_text_type_ids, []) + batch_position_ids = sum(batch_position_ids, []) + batch_task_ids = sum(batch_task_ids, []) + padded_token_ids, input_mask = pad_batch_data( + batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + padded_task_ids = pad_batch_data( + batch_task_ids, self.pad_id)#pad_idx=self.pad_id) + if padded_token_ids.shape[1] > 512: + rel_pos_scaler = _get_rel_pos_scaler(padded_token_ids.shape[1]) + else: + rel_pos_scaler = self.rel_pos[:padded_token_ids.shape[1], :padded_token_ids.shape[1], :] + rel_pos_scaler = np.array([rel_pos_scaler for i in range(padded_token_ids.shape[0])]).astype("int64") + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + padded_task_ids, input_mask, batch_labels, batch_qids, rel_pos_scaler, batch_labels_all + + ] + + return return_list + + +class SequenceLabelReader(BaseReader): + def _pad_batch_records(self, batch_records): + batch_token_ids = [record.token_ids for record in batch_records] + batch_text_type_ids = [record.text_type_ids for record in batch_records] + batch_position_ids = [record.position_ids for record in batch_records] + batch_task_ids = [record.task_ids for record in batch_records] + batch_label_ids = [record.label_ids for record in batch_records] + + # padding + padded_token_ids, input_mask, batch_seq_lens = pad_batch_data( + batch_token_ids, + pad_idx=self.pad_id, + return_input_mask=True, + return_seq_lens=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + padded_task_ids = pad_batch_data( + batch_task_ids, 0)#pad_idx=self.pad_id) + padded_label_ids = pad_batch_data( + batch_label_ids, pad_idx=len(self.label_map) - 1) + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + padded_task_ids, input_mask, padded_label_ids, batch_seq_lens + ] + return return_list + + def _reseg_token_label(self, tokens, labels, tokenizer): + assert len(tokens) == len(labels) + ret_tokens = [] + ret_labels = [] + for token, label in zip(tokens, labels): + sub_token = tokenizer.tokenize(token) + if len(sub_token) == 0: + continue + ret_tokens.extend(sub_token) + ret_labels.append(label) + if len(sub_token) < 2: + continue + sub_label = label + if label.startswith("B-"): + sub_label = "I-" + label[2:] + ret_labels.extend([sub_label] * (len(sub_token) - 1)) + + assert len(ret_tokens) == len(ret_labels) + return ret_tokens, ret_labels + + def _convert_example_to_record(self, example, max_seq_length, tokenizer): + tokens = tokenization.convert_to_unicode(example.text_a).split(u"") + labels = tokenization.convert_to_unicode(example.label).split(u"") + tokens, labels = self._reseg_token_label(tokens, labels, tokenizer) + + if len(tokens) > max_seq_length - 2: + tokens = tokens[0:(max_seq_length - 2)] + labels = 
labels[0:(max_seq_length - 2)] + + tokens = ["[CLS]"] + tokens + ["[SEP]"] + token_ids = tokenizer.convert_tokens_to_ids(tokens) + position_ids = list(range(len(token_ids))) + text_type_ids = [0] * len(token_ids) + task_ids = [0] * len(token_ids) + no_entity_id = len(self.label_map) - 1 + label_ids = [no_entity_id] + [ + self.label_map[label] for label in labels + ] + [no_entity_id] + + Record = namedtuple( + 'Record', + ['token_ids', 'text_type_ids', 'position_ids', 'task_ids', 'label_ids']) + record = Record( + token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids, + task_ids=task_ids, + label_ids=label_ids) + return record + + +class ExtractEmbeddingReader(BaseReader): + def _pad_batch_records(self, batch_records): + batch_token_ids = [record.token_ids for record in batch_records] + batch_text_type_ids = [record.text_type_ids for record in batch_records] + batch_task_ids = [record.task_ids for record in batch_records] + batch_position_ids = [record.position_ids for record in batch_records] + + # padding + padded_token_ids, input_mask, seq_lens = pad_batch_data( + batch_token_ids, + pad_idx=self.pad_id, + return_input_mask=True, + return_seq_lens=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + padded_task_ids = pad_batch_data( + batch_task_ids, pad_idx=0)#self.pad_id) + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + padded_task_ids, input_mask, seq_lens + ] + + return return_list + + +class MRCReader(BaseReader): + def __init__(self, + vocab_path, + label_map_config=None, + max_seq_len=512, + do_lower_case=True, + in_tokens=False, + random_seed=None, + tokenizer="FullTokenizer", + is_classify=True, + is_regression=False, + for_cn=True, + task_id=0, + doc_stride=128, + max_query_length=64, + version_2_with_negative=False): + self.max_seq_len = max_seq_len + self.tokenizer = tokenization.FullTokenizer( + vocab_file=vocab_path, do_lower_case=do_lower_case) + self.vocab = self.tokenizer.vocab + self.pad_id = self.vocab["[PAD]"] + self.cls_id = self.vocab["[CLS]"] + self.sep_id = self.vocab["[SEP]"] + self.in_tokens = in_tokens + self.for_cn = for_cn + self.task_id = task_id + self.doc_stride = doc_stride + self.max_query_length = max_query_length + self.examples = {} + self.features = {} + self.rel_pos = _get_rel_pos_scaler(512) + self.version_2_with_negative = version_2_with_negative + + #if random_seed is not None: + # np.random.seed(random_seed) + + self.trainer_id = 0 + self.trainer_nums = 1 + if os.getenv("PADDLE_TRAINER_ID"): + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + if os.getenv("PADDLE_NODES_NUM"): + self.trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM")) + + self.current_example = 0 + self.current_epoch = 0 + self.num_examples = 0 + + def is_whitespace(self, c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + def _read_json(self, input_file, is_training): + examples = [] + with open(input_file, "r") as f: + input_data = json.load(f)["data"] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if self.is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + 
char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + if self.version_2_with_negative: + is_impossible = qa["is_impossible"] + if len(qa["answers"]) != 1 and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer." + ) + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + + answer_length - 1] + actual_text = " ".join(doc_tokens[start_position:( + end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + print("Could not find answer: '%s' vs. '%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = 0 + end_position = 0 + orig_answer_text = "" + Example = namedtuple('Example', + ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text', + 'start_position', 'end_position', 'is_impossible']) + example = Example( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + return examples + + def _improve_answer_span(self, doc_tokens, input_start, input_end, tokenizer, orig_answer_text): + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + def _check_is_max_context(self, doc_spans, cur_span_index, position): + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, + num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + def _convert_example_to_feature(self, examples, max_seq_length, tokenizer, is_training): + Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index", + "tokens", "token_to_orig_map", "token_is_max_context", + "token_ids", "position_ids", "text_type_ids", + "start_position", "end_position", "is_impossible"]) + features = [] + unique_id = 1000000000 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + if len(query_tokens) > self.max_query_length: + query_tokens = query_tokens[0:self.max_query_length] + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + #print(orig_to_tok_index, example.start_position) + + 
tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = self._improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, + tokenizer, example.orig_answer_text) + + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + _DocSpan = namedtuple("DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, self.doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + text_type_ids = [] + tokens.append("[CLS]") + text_type_ids.append(0) + for token in query_tokens: + tokens.append(token) + text_type_ids.append(0) + tokens.append("[SEP]") + text_type_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[ + split_token_index] + + is_max_context = self._check_is_max_context( + doc_spans, doc_span_index, split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + text_type_ids.append(1) + tokens.append("[SEP]") + text_type_ids.append(1) + + token_ids = tokenizer.convert_tokens_to_ids(tokens) + position_ids = list(range(len(token_ids))) + start_position = None + end_position = None + if is_training and not example.is_impossible: + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + + feature = Feature( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + token_ids=token_ids, + position_ids=position_ids, + text_type_ids=text_type_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible) + features.append(feature) + + unique_id += 1 + + return features + + def _prepare_batch_data(self, records, batch_size, phase=None): + """generate batch records""" + batch_records, max_len = [], 0 + + for index, record in enumerate(records): + if phase == "train": + self.current_example = index + max_len = max(max_len, len(record.token_ids)) + if self.in_tokens: + to_append = (len(batch_records) + 1) * max_len <= batch_size + else: + to_append = len(batch_records) < batch_size + if to_append: + batch_records.append(record) + else: + yield self._pad_batch_records(batch_records, phase=="train") + 
batch_records, max_len = [record], len(record.token_ids) + + if batch_records: + yield self._pad_batch_records(batch_records, phase=="train") + + def _pad_batch_records(self, batch_records, is_training): + batch_token_ids = [record.token_ids for record in batch_records] + batch_text_type_ids = [record.text_type_ids for record in batch_records] + batch_position_ids = [record.position_ids for record in batch_records] + if is_training: + batch_start_position = [record.start_position for record in batch_records] + batch_end_position = [record.end_position for record in batch_records] + batch_start_position = np.array(batch_start_position).astype("int64").reshape([-1, 1]) + batch_end_position = np.array(batch_end_position).astype("int64").reshape([-1, 1]) + else: + batch_size = len(batch_token_ids) + batch_start_position = np.zeros(shape=[batch_size, 1], dtype="int64") + batch_end_position = np.zeros(shape=[batch_size, 1], dtype="int64") + + batch_unique_ids = [record.unique_id for record in batch_records] + batch_unique_ids = np.array(batch_unique_ids).astype("int64").reshape([-1, 1]) + + # padding + padded_token_ids, input_mask = pad_batch_data( + batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + padded_task_ids = np.ones_like(padded_token_ids, dtype="int64") * self.task_id + if padded_token_ids.shape[1] > 512: + rel_pos_scaler = _get_rel_pos_scaler(padded_token_ids.shape[1]) + else: + rel_pos_scaler = self.rel_pos[:padded_token_ids.shape[1], :padded_token_ids.shape[1], :] + rel_pos_scaler = np.array([rel_pos_scaler for i in range(padded_token_ids.shape[0])]).astype("int64") + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, padded_task_ids, + input_mask, batch_start_position, batch_end_position, batch_unique_ids, rel_pos_scaler + ] + + return return_list + + def get_num_examples(self, phase): + return len(self.features[phase]) + + def get_features(self, phase): + return self.features[phase] + + def get_examples(self, phase): + return self.examples[phase] + + def data_generator(self, + input_file, + batch_size, + epoch, + dev_count=1, + shuffle=True, + phase=None): + + examples = self.examples.get(phase, None) + features = self.features.get(phase, None) + if not examples: + examples = self._read_json(input_file, phase=="train") + features = self._convert_example_to_feature(examples, self.max_seq_len, + self.tokenizer, phase=="train") + self.examples[phase] = examples + self.features[phase] = features + + def wrapper(): + #features = self.features.get(phase, None) + all_dev_batches = [] + trainer_id = 0 + for epoch_index in range(epoch): + if phase == "train": + self.current_example = 0 + self.current_epoch = epoch_index + if phase == "train" and shuffle: + self.random_seed = epoch_index + self.global_rng = np.random.RandomState(self.random_seed) + trainer_id = self.trainer_id + self.global_rng.shuffle(features) + if phase != "train": + trainer_id = self.trainer_id + + for batch_data in self._prepare_batch_data( + features, batch_size, phase=phase): + if len(all_dev_batches) < dev_count: + all_dev_batches.append(batch_data) + if len(all_dev_batches) == dev_count: + #for batch in all_dev_batches: + #yield batch + yield all_dev_batches[trainer_id] + all_dev_batches = [] + if phase != "train": + if trainer_id < len(all_dev_batches): + yield all_dev_batches[trainer_id] + + return wrapper 
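+
+
+# Rough sketch of driving MRCReader for SQuAD-style data (see run_mrc.py for
+# the full flow); the paths and sizes below are placeholders:
+#
+#   mrc_reader = MRCReader(vocab_path="vocab.txt", max_seq_len=384,
+#                          doc_stride=128, max_query_length=64)
+#   train_gen = mrc_reader.data_generator("train-v1.1.json", batch_size=32,
+#                                         epoch=2, dev_count=1, shuffle=True,
+#                                         phase="train")
+#   # examples/features are cached per phase on the first data_generator call,
+#   # so get_num_examples("train") only works after that call.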
+ + +if __name__ == '__main__': + data_reader = ClassifyReader( + vocab_path="./package/vocab.txt", + label_map_config="./package/task_data/xnli/label_map.json", + max_seq_len=512, + do_lower_case=True, + in_tokens=True) + train_data_generator = data_reader.data_generator( + input_file="./package/task_data/xnli/train.tsv", + batch_size=8192, + epoch=3, + shuffle=True, + phase="train") + for batch_data in train_data_generator(): + tokens, text_types, postions, tasks, masks, labels, qids = batch_data + print(tokens.tolist()) diff --git a/ernie-gram/reader/tokenization.py b/ernie-gram/reader/tokenization.py new file mode 100644 index 0000000..51954bb --- /dev/null +++ b/ernie-gram/reader/tokenization.py @@ -0,0 +1,506 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import unicodedata +import six +#import sentencepiece as sp + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
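+    # so we normalize to the interpreter's native `str` type before returning.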
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + fin = open(vocab_file) + for num, line in enumerate(fin): + items = convert_to_unicode(line.strip()).split("\t") + if len(items) > 2: + break + token = items[0] + index = items[1] if len(items) == 2 else num + token = token.strip() + vocab[token] = int(index) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids_include_unk(vocab, tokens, unk_token="[UNK]"): + output = [] + for token in tokens: + if token in vocab: + output.append(vocab[token]) + else: + output.append(vocab[unk_token]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + #tokens = self.basic_tokenizer.tokenize(text) + #text = " ".join(tokens) + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class CharTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in text.lower().split(" "): + for sub_token in self.tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. 
+ """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class SentencepieceTokenizer(object): + """Runs SentencePiece tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]"): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.do_lower_case = do_lower_case + self.tokenizer = sp.SentencePieceProcessor() + self.tokenizer.Load(vocab_file + ".model") + self.sp_unk_token = "" + self.unk_token = unk_token + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + Returns: + A list of wordpiece tokens. + """ + text = text.lower() if self.do_lower_case else text + text = convert_to_unicode(text.replace("\1", " ")) + tokens = self.tokenizer.EncodeAsPieces(text) + + output_tokens = [] + for token in tokens: + if token == self.sp_unk_token: + token = self.unk_token + + if token in self.vocab: + output_tokens.append(token) + else: + output_tokens.append(self.unk_token) + + return output_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class WordsegTokenizer(object): + """Runs Wordseg tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]", + split_token="\1"): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.tokenizer = sp.SentencePieceProcessor() + self.tokenizer.Load(vocab_file + ".model") + + self.do_lower_case = do_lower_case + self.unk_token = unk_token + self.split_token = split_token + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + Returns: + A list of wordpiece tokens. + """ + text = text.lower() if self.do_lower_case else text + text = convert_to_unicode(text) + + output_tokens = [] + for token in text.split(self.split_token): + if token in self.vocab: + output_tokens.append(token) + else: + sp_tokens = self.tokenizer.EncodeAsPieces(token) + for sp_token in sp_tokens: + if sp_token in self.vocab: + output_tokens.append(sp_token) + return output_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. 
+ + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + +def tokenize_chinese_chars(text): + """Adds whitespace around any CJK character.""" + + def _is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + output = [] + buff = "" + for char in text: + cp = ord(char) + if _is_chinese_char(cp): + if buff != "": + output.append(buff) + buff = "" + output.append(char) + else: + buff += char + + if buff != "": + output.append(buff) + + return output diff --git a/ernie-gram/run.sh b/ernie-gram/run.sh new file mode 100644 index 0000000..03c308a --- /dev/null +++ b/ernie-gram/run.sh @@ -0,0 +1,67 @@ + +#set -ex + +source ./utils/utils.sh +source ./task_conf $1 $2 +export FLAGS_eager_delete_tensor_gb=2.0 +export FLAGS_sync_nccl_allreduce=1 + + +iplist=`hostname -i` +check_iplist + +mkdir -p ./tmpout +mkdir -p ./log +mkdir -p ./data + +distributed_args="--node_ips ${PADDLE_TRAINERS} \ + --node_id ${PADDLE_TRAINER_ID} \ + --current_node_ip ${POD_IP} \ + --nproc_per_node ${gpu_card} \ + --grid_lr ${lr} \ + --grid_bsz ${batch_size} \ + --grid_epoch ${epoch}" + +python -u ./lanch.py ${distributed_args} \ + ./${scripts:-"run_classifier.py"} --use_cuda true \ + --is_distributed true \ + --tokenizer ${TOKENIZER:-"FullTokenizer"} \ + --do_train true \ + --do_val true \ + --do_test ${do_test:="false"} \ + --verbose true \ + --in_tokens false \ + --init_pretraining_params ${init_model:-""} \ + --train_set ${train_set} \ + --dev_set ${dev_set} \ + --test_set ${test_set} \ + --run_file_path ${run_file_path:-""} \ + --vocab_path ${vocab_path} \ + --ernie_config_path ${CONFIG_PATH} \ + --checkpoints ./checkpoints \ + --save_steps 10000000 \ + --weight_decay ${weight_decay} \ + --warmup_proportion ${warmup} \ + --validation_steps 10000000 \ + --max_seq_len ${max_seq_len:-128} \ + --skip_steps 10 \ + --num_iteration_per_drop_scope 1 \ + --num_labels ${num_labels:-2} \ + --use_multi_gpu_test true \ + --metric ${metric:-"simple_accuracy"} \ + --for_race ${for_race:-"false"} \ + --has_fc ${has_fc:-"true"} \ + --is_regression ${is_regression:-"false"} \ + --is_classify ${is_classify:-"true"} \ + --eval_span ${eval_span:-"false"} \ + --version_2 ${version_2:-"false"} \ + --random_seed 1 > log/lanch.log 2>&1 + + + + + + + + + diff --git a/ernie-gram/run_classifier.py b/ernie-gram/run_classifier.py new file mode 100644 index 0000000..c5dfff5 --- /dev/null +++ b/ernie-gram/run_classifier.py @@ -0,0 +1,432 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
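+
+# Note: this script takes its grid-search hyperparameters from environment
+# variables (GRID_SEARCH_BSZ, GRID_SEARCH_LR, GRID_SEARCH_EPOCH, RANDSEED)
+# rather than from the command line; see main() below and the launcher
+# arguments passed in run.sh.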
+"""Finetuning on classification tasks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import multiprocessing + +import paddle.fluid as fluid +import paddle +import reader.task_reader as task_reader +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +from model.ernie import ErnieConfig +from finetune.classifier import create_model, evaluate_classify, evaluate_regression +from model.optimization import optimization +from utils.args import print_arguments +from utils.init import init_pretraining_params, init_checkpoint +from finetune.finetune_args import parser + +args = parser.parse_args() + +def create_strategy(args): + """ + Create build strategy and exec strategy. + Args: + + Returns: + build_strategy: build strategy + exec_strategy: exec strategy + """ + build_strategy = paddle.static.BuildStrategy() + exec_strategy = paddle.static.ExecutionStrategy() + + build_strategy.enable_addto = True if args.use_fp16 else False + build_strategy.enable_sequential_execution = True + + if args.use_fast_executor: + exec_strategy.use_experimental_executor = True + exec_strategy.num_threads = 4 if args.use_fp16 else 2 + exec_strategy.num_iteration_per_drop_scope = max(1000, args.skip_steps) + + return build_strategy, exec_strategy + +def create_distributed_strategy(args, + build_strategy=None, + exec_strategy=None): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + trainer_id = fleet.worker_index() + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = fleet.worker_endpoints() + num_trainers = len(worker_endpoints) + print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}" + .format(worker_endpoints, num_trainers, current_endpoint, trainer_id)) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.execution_strategy = exec_strategy \ + if exec_strategy else paddle.static.ExecutionStrategy() + dist_strategy.build_strategy = build_strategy \ + if build_strategy else paddle.static.ExecutionStrategy() + + dist_strategy.fuse_all_reduce_ops = True if args.use_fuse else False + + dist_strategy.nccl_comm_num = args.nccl_comm_num + if args.nccl_comm_num > 1: + dist_strategy.sync_nccl_allreduce=False + + if args.use_hierarchical_allreduce \ + and num_trainers > args.hierarchical_allreduce_inter_nranks: + dist_strategy.use_hierarchical_allreduce = \ + args.use_hierarchical_allreduce + dist_strategy.hierarchical_allreduce_inter_nranks = \ + args.hierarchical_allreduce_inter_nranks + + if args.use_fp16: + print("use ammmmmmmmmmmmmmmmp") + dist_strategy.amp = True + #custom_black_list + custom_white_list = ['softmax', 'layer_norm', 'gelu', 'relu'] + dist_strategy.amp_configs = { + 'custom_white_list': custom_white_list, + 'init_loss_scaling': args.init_loss_scaling + } + + if args.use_recompute: + dist_strategy.recompute = True + + return num_trainers, trainer_id, dist_strategy + +def main(args): + args.epoch = int(os.getenv("GRID_SEARCH_EPOCH")) + args.learning_rate = float(os.getenv("GRID_SEARCH_LR")) + args.random_seed = int(os.getenv("RANDSEED")) + args.batch_size = int(os.getenv("GRID_SEARCH_BSZ")) + print("Modified -> bsz: %d, epoch: %d, lr: %5f, randseed: %d"% + (args.batch_size, args.epoch, args.learning_rate, args.random_seed)) + + ernie_config = ErnieConfig(args.ernie_config_path) + ernie_config.print_config() + # Initialize the paddle execute enviroment + 
paddle.enable_static() + + build_strategy, exec_strategy = create_strategy(args) + + node_nums = int(os.getenv("PADDLE_NODES_NUM")) + + trainers_num = 1 + trainer_id = 0 + #num_train_steps = args.num_train_steps + #warmup_steps = args.warmup_steps + trainers_num, trainer_id, dist_strategy = \ + create_distributed_strategy(args, build_strategy, exec_strategy) + + gpu_id = 0 + gpus = fluid.core.get_cuda_device_count() + if args.is_distributed: + gpus = os.getenv("FLAGS_selected_gpus").split(",") + gpu_id = int(gpus[0]) + + if args.use_cuda: + place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) + dev_count = fluid.core.get_cuda_device_count() + else: + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + if not args.for_race: + reader = task_reader.ClassifyReader( + vocab_path=args.vocab_path, + label_map_config=args.label_map_config, + max_seq_len=args.max_seq_len, + do_lower_case=args.do_lower_case, + in_tokens=args.in_tokens, + tokenizer=args.tokenizer, + is_classify=args.is_classify, + is_regression=args.is_regression, + eval_span=args.eval_span) + else: + reader = task_reader.ClassifyReaderRace( + vocab_path=args.vocab_path, + label_map_config=args.label_map_config, + max_seq_len=args.max_seq_len, + do_lower_case=args.do_lower_case, + in_tokens=args.in_tokens, + tokenizer=args.tokenizer, + is_classify=args.is_classify, + is_regression=args.is_regression) + + if not (args.do_train or args.do_val or args.do_test): + raise ValueError("For args `do_train`, `do_val` and `do_test`, at " + "least one of them must be True.") + + startup_prog = fluid.Program() + if args.random_seed is not None: + startup_prog.random_seed = args.random_seed + if args.predict_batch_size == None: + args.predict_batch_size = args.batch_size + + if args.do_train: + train_data_generator = reader.data_generator( + input_file=args.train_set, + batch_size=args.batch_size, + epoch=args.epoch, + dev_count=trainers_num, + shuffle=True, + phase="train") + + num_train_examples = reader.get_num_examples(args.train_set) + + if args.in_tokens: + max_train_steps = args.epoch * num_train_examples // ( + args.batch_size // args.max_seq_len) // trainers_num + else: + max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num + + warmup_steps = int(max_train_steps * args.warmup_proportion) + print("Device count: %d, gpu_id: %d" % (trainers_num, gpu_id)) + print("Num train examples: %d" % num_train_examples) + print("Max train steps: %d" % max_train_steps) + print("Num warmup steps: %d" % warmup_steps) + + train_program = fluid.Program() + + with fluid.program_guard(train_program, startup_prog): + with fluid.unique_name.guard(): + train_pyreader, graph_vars = create_model( + args, + pyreader_name='train_reader', + ernie_config=ernie_config, + is_classify=args.is_classify, + is_regression=args.is_regression, + for_race=args.for_race, + has_fc=args.has_fc) + if args.use_recompute: + dist_strategy.recompute_configs = { + "checkpoints": graph_vars["checkpoints"], + "enable_offload": False, + } + scheduled_lr, loss_scaling = optimization( + loss=graph_vars["loss"], + warmup_steps=warmup_steps, + num_train_steps=max_train_steps, + learning_rate=args.learning_rate, + train_program=train_program, + startup_prog=startup_prog, + weight_decay=args.weight_decay, + scheduler=args.lr_scheduler, + dist_strategy=dist_strategy, + use_amp=args.use_fp16, + init_loss_scaling=args.init_loss_scaling, + incr_every_n_steps=args.incr_every_n_steps, + 
decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, + incr_ratio=args.incr_ratio, + decr_ratio=args.decr_ratio, + layer_decay_rate=args.layer_wise_decay_rate, + n_layers=ernie_config['num_hidden_layers']) + #graph_vars["learning_rate"] = scheduled_lr + #graph_vars["loss_scaling"] = loss_scaling + + if args.do_val or args.do_test: + test_prog = fluid.Program() + with fluid.program_guard(test_prog, startup_prog): + with fluid.unique_name.guard(): + test_pyreader, graph_vars = create_model( + args, + pyreader_name='test_reader', + ernie_config=ernie_config, + is_classify=args.is_classify, + is_regression=args.is_regression, + for_race=args.for_race, + has_fc=args.has_fc) + + test_prog = test_prog.clone(for_test=True) + + exe = fluid.Executor(place) + exe.run(startup_prog) + + if args.do_train: + if args.init_checkpoint and args.init_pretraining_params: + print( + "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " + "both are set! Only arg 'init_checkpoint' is made valid.") + if args.init_checkpoint: + init_checkpoint( + exe, + args.init_checkpoint, + main_program=startup_prog) + elif args.init_pretraining_params: + init_pretraining_params( + exe, + args.init_pretraining_params, + main_program=startup_prog) + elif args.do_val or args.do_test: + if not args.init_checkpoint: + raise ValueError("args 'init_checkpoint' should be set if" + "only doing validation or testing!") + init_checkpoint( + exe, + args.init_checkpoint, + main_program=startup_prog) + + if args.do_train: + train_exe = exe + train_pyreader.decorate_tensor_provider(train_data_generator) + else: + train_exe = None + + test_exe = exe + test_dev_count = 1 + if args.do_val or args.do_test: + if args.use_multi_gpu_test: + test_dev_count = min(trainers_num, 8) + print("test_dev_count:", test_dev_count) + + if args.do_train: + train_pyreader.start() + steps = 0 + #if warmup_steps > 0: + # graph_vars["learning_rate"] = scheduled_lr + current_epoch = 0 + last_epoch = 0 + time_begin = time.time() + skip_steps = args.skip_steps + while steps < max_train_steps: + try: + steps += 1 #nccl2_num_trainers + + if steps % skip_steps == 0: + if args.is_regression: + outputs = evaluate_regression(train_exe, train_program, train_pyreader, + graph_vars, "train") + else: + outputs = evaluate_classify(train_exe, train_program, train_pyreader, + graph_vars, "train") + + if args.verbose: + verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( + ) + #verbose += "learning rate: %f" % ( + # outputs["learning_rate"] + # if warmup_steps > 0 else args.learning_rate) + print(verbose) + + #current_example, current_epoch = reader.get_train_progress() + current_epoch = steps * args.batch_size * trainers_num // num_train_examples + current_example = steps * args.batch_size * trainers_num % num_train_examples + time_end = time.time() + used_time = time_end - time_begin + if args.is_classify: + print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " + "ave acc: %f, speed: %f steps/s lr: %.5f" % + (current_epoch, current_example, num_train_examples, + steps, outputs["loss"], outputs["accuracy"], + args.skip_steps / used_time, scheduled_lr.get_lr())) + else: + print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " + "speed: %f steps/s lr: %.5f" % + (current_epoch, current_example, num_train_examples, + steps, outputs["loss"], args.skip_steps / used_time, scheduled_lr.get_lr())) + time_begin = time.time() + else: + train_exe.run(fetch_list=[], program=train_program) + + scheduled_lr.step() + + if trainer_id == 0: + if steps % 
args.save_steps == 0: + save_path = os.path.join(args.checkpoints, + "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + current_epoch = steps * args.batch_size * trainers_num // num_train_examples + + if trainer_id < 8: + if last_epoch != current_epoch: + # evaluate dev set + if args.do_val: + for ds in args.dev_set.split(','): + test_pyreader.decorate_tensor_provider( + reader.data_generator( + ds, + batch_size=args.batch_size, + epoch=1, + dev_count=test_dev_count, + shuffle=False)) + if args.is_regression: + evaluate_regression(exe, test_prog, test_pyreader, graph_vars, "dev") + else: + evaluate_classify(exe, test_prog, test_pyreader, graph_vars, + "dev", use_multi_gpu_test=args.use_multi_gpu_test, + gpu_id=gpu_id, dev_count=test_dev_count, metric=args.metric, + eval_span=args.eval_span) + if args.do_test: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.test_set, + batch_size=args.batch_size, + epoch=1, + dev_count=test_dev_count, + shuffle=False)) + if args.is_regression: + evaluate_regression(exe, test_prog, test_pyreader, graph_vars, "test") + else: + evaluate_classify(exe, test_prog, test_pyreader, graph_vars, + "test", use_multi_gpu_test=args.use_multi_gpu_test, + gpu_id=gpu_id, dev_count=test_dev_count, metric=args.metric, + eval_span=args.eval_span) + + if last_epoch != current_epoch: + last_epoch = current_epoch + + except fluid.core.EOFException: + save_path = os.path.join(args.checkpoints, "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + train_pyreader.reset() + break + train_pyreader.reset() + + # final eval on dev set + if args.do_val: + for ds in args.dev_set.split(','): + test_pyreader.decorate_tensor_provider( + reader.data_generator( + ds, + batch_size=args.batch_size, + epoch=1, + dev_count=test_dev_count, + shuffle=False)) + print("Final validation result:") + if args.is_regression: + evaluate_regression(exe, test_prog, test_pyreader, graph_vars, "dev") + else: + evaluate_classify(exe, test_prog, test_pyreader, graph_vars, + "dev", use_multi_gpu_test=args.use_multi_gpu_test, + gpu_id=gpu_id, dev_count=test_dev_count, metric=args.metric, + eval_span=args.eval_span) + + # final eval on test set + if args.do_test: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.test_set, + batch_size=args.batch_size, + epoch=1, + dev_count=test_dev_count, + shuffle=False)) + print("Final test result:") + if args.is_regression: + evaluate_regression(exe, test_prog, test_pyreader, graph_vars, "test") + else: + evaluate_classify(exe, test_prog, test_pyreader, graph_vars, + "test", use_multi_gpu_test=args.use_multi_gpu_test, + gpu_id=gpu_id, dev_count=test_dev_count, metric=args.metric, + eval_span=args.eval_span) + +if __name__ == '__main__': + #print_arguments(args) + main(args) diff --git a/ernie-gram/run_mrc.py b/ernie-gram/run_mrc.py new file mode 100644 index 0000000..32e9efc --- /dev/null +++ b/ernie-gram/run_mrc.py @@ -0,0 +1,424 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Finetuning on classification tasks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import multiprocessing + +# NOTE(paddle-dev): All of these flags should be +# set before `import paddle`. Otherwise, it would +# not take any effect. +os.environ['FLAGS_eager_delete_tensor_gb'] = '0' # enable gc + +import paddle.fluid as fluid + +import paddle +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import reader.task_reader as task_reader +from model.ernie import ErnieConfig +from finetune.mrc import create_model, evaluate +from model.optimization import optimization +from utils.args import print_arguments +from utils.init import init_pretraining_params, init_checkpoint +from finetune.finetune_args import parser + +args = parser.parse_args() + +def create_strategy(args): + """ + Create build strategy and exec strategy. + Args: + + Returns: + build_strategy: build strategy + exec_strategy: exec strategy + """ + build_strategy = paddle.static.BuildStrategy() + exec_strategy = paddle.static.ExecutionStrategy() + + build_strategy.enable_addto = True if args.use_fp16 else False + build_strategy.enable_sequential_execution = True + + if args.use_fast_executor: + exec_strategy.use_experimental_executor = True + exec_strategy.num_threads = 4 if args.use_fp16 else 2 + exec_strategy.num_iteration_per_drop_scope = max(1000, args.skip_steps) + + return build_strategy, exec_strategy + +def create_distributed_strategy(args, + build_strategy=None, + exec_strategy=None): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + trainer_id = fleet.worker_index() + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = fleet.worker_endpoints() + num_trainers = len(worker_endpoints) + print("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}" + .format(worker_endpoints, num_trainers, current_endpoint, trainer_id)) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.execution_strategy = exec_strategy \ + if exec_strategy else paddle.static.ExecutionStrategy() + dist_strategy.build_strategy = build_strategy \ + if build_strategy else paddle.static.ExecutionStrategy() + + dist_strategy.fuse_all_reduce_ops = True if args.use_fuse else False + + dist_strategy.nccl_comm_num = args.nccl_comm_num + if args.nccl_comm_num > 1: + dist_strategy.sync_nccl_allreduce=False + + if args.use_hierarchical_allreduce \ + and num_trainers > args.hierarchical_allreduce_inter_nranks: + dist_strategy.use_hierarchical_allreduce = \ + args.use_hierarchical_allreduce + dist_strategy.hierarchical_allreduce_inter_nranks = \ + args.hierarchical_allreduce_inter_nranks + + if args.use_fp16: + print("use ammmmmmmmmmmmmmmmp") + dist_strategy.amp = True + #custom_black_list + custom_white_list = ['softmax', 'layer_norm', 'gelu', 'relu'] + dist_strategy.amp_configs = { + 'custom_white_list': custom_white_list, + 'init_loss_scaling': args.init_loss_scaling + } + + if args.use_recompute: + dist_strategy.recompute = True + + return num_trainers, trainer_id, dist_strategy + + +def main(args): + args.epoch = int(os.getenv("GRID_SEARCH_EPOCH")) + args.learning_rate = float(os.getenv("GRID_SEARCH_LR")) + args.random_seed = int(os.getenv("RANDSEED")) + args.batch_size = int(os.getenv("GRID_SEARCH_BSZ")) + print("Modified -> 
bsz: %d, epoch: %d, lr: %5f, randseed: %d"% + (args.batch_size, args.epoch, args.learning_rate, args.random_seed)) + + ernie_config = ErnieConfig(args.ernie_config_path) + ernie_config.print_config() + # Initialize the paddle execute enviroment + paddle.enable_static() + + build_strategy, exec_strategy = create_strategy(args) + + node_nums = int(os.getenv("PADDLE_NODES_NUM")) + + trainers_num = 1 + trainer_id = 0 + #num_train_steps = args.num_train_steps + #warmup_steps = args.warmup_steps + trainers_num, trainer_id, dist_strategy = \ + create_distributed_strategy(args, build_strategy, exec_strategy) + + gpu_id = 0 + gpus = fluid.core.get_cuda_device_count() + if args.is_distributed: + gpus = os.getenv("FLAGS_selected_gpus").split(",") + gpu_id = int(gpus[0]) + + if args.use_cuda: + place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) + dev_count = fluid.core.get_cuda_device_count() + else: + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + reader = task_reader.MRCReader( + vocab_path=args.vocab_path, + label_map_config=args.label_map_config, + max_seq_len=args.max_seq_len, + do_lower_case=args.do_lower_case, + in_tokens=args.in_tokens, + random_seed=args.random_seed, + tokenizer=args.tokenizer, + is_classify=args.is_classify, + is_regression=args.is_regression, + for_cn=args.for_cn, + task_id=args.task_id, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + version_2_with_negative=args.version_2) + + if not (args.do_train or args.do_val or args.do_test): + raise ValueError("For args `do_train`, `do_val` and `do_test`, at " + "least one of them must be True.") + + #if args.do_test: + # assert args.test_save is not None + startup_prog = fluid.Program() + if args.random_seed is not None: + startup_prog.random_seed = args.random_seed + + if args.predict_batch_size == None: + args.predict_batch_size = args.batch_size + if args.do_train: + train_data_generator = reader.data_generator( + input_file=args.train_set, + batch_size=args.batch_size, + epoch=args.epoch, + dev_count=trainers_num, + shuffle=True, + phase="train") + + num_train_examples = reader.get_num_examples("train") + + if args.in_tokens: + max_train_steps = args.epoch * num_train_examples // ( + args.batch_size // args.max_seq_len) // trainers_num + else: + max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num + + warmup_steps = int(max_train_steps * args.warmup_proportion) + print("Device count: %d, gpu_id: %d" % (trainers_num, gpu_id)) + print("Num train examples: %d" % num_train_examples) + print("Max train steps: %d" % max_train_steps) + print("Num warmup steps: %d" % warmup_steps) + + train_program = fluid.Program() + + with fluid.program_guard(train_program, startup_prog): + with fluid.unique_name.guard(): + train_pyreader, graph_vars = create_model( + args, + pyreader_name='train_reader', + ernie_config=ernie_config, + is_training=True) + if args.use_recompute: + dist_strategy.recompute_configs = { + "checkpoints": graph_vars["checkpoints"], + "enable_offload": False, + } + + scheduled_lr, loss_scaling = optimization( + loss=graph_vars["loss"], + warmup_steps=warmup_steps, + num_train_steps=max_train_steps, + learning_rate=args.learning_rate, + train_program=train_program, + startup_prog=startup_prog, + weight_decay=args.weight_decay, + scheduler=args.lr_scheduler, + dist_strategy=dist_strategy, + use_amp=args.use_fp16, + layer_decay_rate=args.layer_wise_decay_rate, + 
n_layers=ernie_config['num_hidden_layers']) + + if args.do_val or args.do_test: + test_prog = fluid.Program() + with fluid.program_guard(test_prog, startup_prog): + with fluid.unique_name.guard(): + test_pyreader, test_graph_vars = create_model( + args, + pyreader_name='test_reader', + ernie_config=ernie_config, + is_training=False) + + test_prog = test_prog.clone(for_test=True) + + exe = fluid.Executor(place) + exe.run(startup_prog) + + if args.do_train: + if args.init_checkpoint and args.init_pretraining_params: + print( + "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " + "both are set! Only arg 'init_checkpoint' is made valid.") + if args.init_checkpoint: + init_checkpoint( + exe, + args.init_checkpoint, + main_program=startup_prog) + elif args.init_pretraining_params: + init_pretraining_params( + exe, + args.init_pretraining_params, + main_program=startup_prog) + elif args.do_val or args.do_test: + if not args.init_checkpoint: + raise ValueError("args 'init_checkpoint' should be set if" + "only doing validation or testing!") + init_checkpoint( + exe, + args.init_checkpoint, + main_program=startup_prog) + + if args.do_train: + + train_exe = exe + train_pyreader.decorate_tensor_provider(train_data_generator) + else: + train_exe = None + + test_exe = exe + test_dev_count = 1 + if args.do_val or args.do_test: + if args.use_multi_gpu_test: + test_dev_count = min(trainers_num, 8) + + if args.do_train: + train_pyreader.start() + steps = 0 + current_epoch = 0 + last_epoch = 0 + time_begin = time.time() + while steps < max_train_steps: + try: + steps += 1 + if steps % args.skip_steps != 0: + train_exe.run(fetch_list=[], program=train_program) + else: + outputs = evaluate(train_exe, train_program, train_pyreader, + graph_vars, "train", version_2_with_negative=args.version_2) + current_epoch = steps * args.batch_size * trainers_num // num_train_examples + current_example = steps * args.batch_size * trainers_num % num_train_examples + time_end = time.time() + used_time = time_end - time_begin + print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " + "speed: %f steps/s lr: %.6f" % + (current_epoch, current_example, num_train_examples, + steps, outputs["loss"], args.skip_steps / used_time, scheduled_lr.get_lr())) + time_begin = time.time() + scheduled_lr.step() + + if trainer_id == 0: + if steps % args.save_steps == 0: + save_path = os.path.join(args.checkpoints, + "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + current_epoch = steps * args.batch_size * trainers_num // num_train_examples + + if trainer_id < 8: + if last_epoch != current_epoch: + if args.do_val: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.dev_set, + batch_size=args.batch_size, + epoch=1, + dev_count=test_dev_count, + shuffle=False, + phase="dev")) + evaluate( + exe, + test_prog, + test_pyreader, + test_graph_vars, + str(steps) + "_dev", + examples=reader.get_examples("dev"), + features=reader.get_features("dev"), + args=args, + use_multi_gpu_test=args.use_multi_gpu_test, + gpu_id=gpu_id, + dev_count=test_dev_count, + tokenizer=reader.tokenizer, + version_2_with_negative=args.version_2) + + if args.do_test: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.test_set, + batch_size=args.batch_size, + epoch=1, + dev_count=test_dev_count, + shuffle=False, + phase="test")) + evaluate( + exe, + test_prog, + test_pyreader, + test_graph_vars, + str(steps) + "_test", + examples=reader.get_examples("test"), + 
features=reader.get_features("test"), + args=args, + use_multi_gpu_test=args.use_multi_gpu_test, + gpu_id=gpu_id, + dev_count=test_dev_count,tokenizer=reader.tokenizer, + version_2_with_negative=args.version_2) + if last_epoch != current_epoch: + last_epoch = current_epoch + + except fluid.core.EOFException: + save_path = os.path.join(args.checkpoints, "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + train_pyreader.reset() + break + + train_pyreader.reset() + # final eval on dev set + if args.do_val: + print("Final validation result:") + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.dev_set, + batch_size=args.batch_size, + epoch=1, + dev_count=test_dev_count, + shuffle=False, + phase="dev")) + evaluate( + exe, + test_prog, + test_pyreader, + test_graph_vars, + "dev", + examples=reader.get_examples("dev"), + features=reader.get_features("dev"), + args=args, + use_multi_gpu_test=args.use_multi_gpu_test, + gpu_id=gpu_id, + dev_count=test_dev_count,tokenizer=reader.tokenizer, + version_2_with_negative=args.version_2) + + # final eval on test set + if args.do_test: + print("Final test result:") + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.test_set, + batch_size=args.batch_size, + epoch=1, + dev_count=test_dev_count, + shuffle=False, + phase="test")) + evaluate( + exe, + test_prog, + test_pyreader, + test_graph_vars, + "test", + examples=reader.get_examples("test"), + features=reader.get_features("test"), + args=args, + use_multi_gpu_test=args.use_multi_gpu_test, + gpu_id=gpu_id, + dev_count=test_dev_count,tokenizer=reader.tokenizer, + version_2_with_negative=args.version_2) + +if __name__ == '__main__': + print_arguments(args) + main(args) diff --git a/ernie-gram/run_sequence_labeling.py b/ernie-gram/run_sequence_labeling.py new file mode 100644 index 0000000..d10bda2 --- /dev/null +++ b/ernie-gram/run_sequence_labeling.py @@ -0,0 +1,337 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Finetuning on classification tasks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import multiprocessing + +import paddle.fluid as fluid + +import reader.task_reader as task_reader +from model.ernie import ErnieConfig +from finetune.sequence_label import create_model, evaluate +from optimization import optimization +from utils.args import print_arguments +from utils.init import init_pretraining_params, init_checkpoint +from finetune_args import parser + +args = parser.parse_args() + + +def main(args): + ernie_config = ErnieConfig(args.ernie_config_path) + ernie_config.print_config() + + gpu_id = 0 + gpus = fluid.core.get_cuda_device_count() + if args.is_distributed: + gpus = os.getenv("FLAGS_selected_gpus").split(",") + gpu_id = int(gpus[0]) + + if args.use_cuda: + place = fluid.CUDAPlace(gpu_id) + #dev_count = int(os.getenv("PADDLE_TRAINERS_NUM")) if args.is_distributed else gpus + dev_count = len(gpus) if args.is_distributed else gpus + else: + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + reader = task_reader.SequenceLabelReader( + vocab_path=args.vocab_path, + label_map_config=args.label_map_config, + max_seq_len=args.max_seq_len, + do_lower_case=args.do_lower_case, + in_tokens=args.in_tokens, + tokenizer=args.tokenizer) + + if not (args.do_train or args.do_val or args.do_test): + raise ValueError("For args `do_train`, `do_val` and `do_test`, at " + "least one of them must be True.") + + startup_prog = fluid.Program() + if args.random_seed is not None: + startup_prog.random_seed = args.random_seed + + if args.do_train: + trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM")) + train_data_generator = reader.data_generator( + input_file=args.train_set, + batch_size=args.batch_size, + epoch=args.epoch, + dev_count=trainers_num, + shuffle=True, + phase="train") + + num_train_examples = reader.get_num_examples(args.train_set) + + if args.in_tokens: + max_train_steps = args.epoch * num_train_examples // ( + args.batch_size // args.max_seq_len) // trainers_num + else: + max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num + + warmup_steps = int(max_train_steps * args.warmup_proportion) + print("Device count: %d, gpu_id: %d" % (trainers_num, gpu_id)) + print("Num train examples: %d" % num_train_examples) + print("Max train steps: %d" % max_train_steps) + print("Num warmup steps: %d" % warmup_steps) + + train_program = fluid.Program() + + with fluid.program_guard(train_program, startup_prog): + with fluid.unique_name.guard(): + train_pyreader, graph_vars = create_model( + args, + pyreader_name='train_reader', + ernie_config=ernie_config) + scheduled_lr = optimization( + loss=graph_vars["loss"], + warmup_steps=warmup_steps, + num_train_steps=max_train_steps, + learning_rate=args.learning_rate, + train_program=train_program, + startup_prog=startup_prog, + weight_decay=args.weight_decay, + scheduler=args.lr_scheduler, + use_fp16=args.use_fp16, + use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, + init_loss_scaling=args.init_loss_scaling, + incr_every_n_steps=args.incr_every_n_steps, + decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, + incr_ratio=args.incr_ratio, + decr_ratio=args.decr_ratio) + """ + fluid.memory_optimize( + input_program=train_program, + skip_opt_set=[ + graph_vars["loss"].name, + graph_vars["labels"].name, + graph_vars["infers"].name, + graph_vars["seq_lens"].name + ]) + """ + + if 
args.verbose: + if args.in_tokens: + lower_mem, upper_mem, unit = fluid.contrib.memory_usage( + program=train_program, + batch_size=args.batch_size // args.max_seq_len) + else: + lower_mem, upper_mem, unit = fluid.contrib.memory_usage( + program=train_program, batch_size=args.batch_size) + print("Theoretical memory usage in training: %.3f - %.3f %s" % + (lower_mem, upper_mem, unit)) + + if args.do_val or args.do_test: + test_prog = fluid.Program() + with fluid.program_guard(test_prog, startup_prog): + with fluid.unique_name.guard(): + test_pyreader, graph_vars = create_model( + args, + pyreader_name='test_reader', + ernie_config=ernie_config) + + test_prog = test_prog.clone(for_test=True) + + nccl2_num_trainers = 1 + nccl2_trainer_id = 0 + print("args.is_distributed:", args.is_distributed) + if args.is_distributed: + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env.split(",") + trainers_num = len(worker_endpoints) + + print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ + trainer_id:{}".format(worker_endpoints, trainers_num, + current_endpoint, trainer_id)) + + # prepare nccl2 env. + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile( + trainer_id, + trainers=worker_endpoints_env, + current_endpoint=current_endpoint, + program=train_program, + startup_program=startup_prog) + nccl2_num_trainers = trainers_num + nccl2_trainer_id = trainer_id + + exe = fluid.Executor(place) + exe.run(startup_prog) + + if args.do_train: + if args.init_checkpoint and args.init_pretraining_params: + print( + "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " + "both are set! 
Only arg 'init_checkpoint' is made valid.") + if args.init_checkpoint: + init_checkpoint( + exe, + args.init_checkpoint, + main_program=startup_prog, + use_fp16=args.use_fp16) + elif args.init_pretraining_params: + init_pretraining_params( + exe, + args.init_pretraining_params, + main_program=startup_prog, + use_fp16=args.use_fp16) + elif args.do_val or args.do_test: + if not args.init_checkpoint: + raise ValueError("args 'init_checkpoint' should be set if" + "only doing validation or testing!") + init_checkpoint( + exe, + args.init_checkpoint, + main_program=startup_prog, + use_fp16=args.use_fp16) + + if args.do_train: + exec_strategy = fluid.ExecutionStrategy() + if args.use_fast_executor: + exec_strategy.use_experimental_executor = True + exec_strategy.num_threads = dev_count + exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope + + train_exe = fluid.ParallelExecutor( + use_cuda=args.use_cuda, + loss_name=graph_vars["loss"].name, + exec_strategy=exec_strategy, + main_program=train_program, + num_trainers=nccl2_num_trainers, + trainer_id=nccl2_trainer_id) + + train_pyreader.decorate_tensor_provider(train_data_generator) + else: + train_exe = None + + test_exe = exe + test_dev_count = 1 + if args.do_val or args.do_test: + if args.use_multi_gpu_test: + test_dev_count = min(trainers_num, 8) + + if args.do_train: + train_pyreader.start() + steps = 0 + #if warmup_steps > 0: + # graph_vars["learning_rate"] = scheduled_lr + + time_begin = time.time() + + skip_steps = args.skip_steps * nccl2_num_trainers + while True: + try: + steps += nccl2_num_trainers + + if steps % skip_steps == 0: + outputs = evaluate(train_exe, train_program, train_pyreader, + graph_vars, args.num_labels, "train", + dev_count) + if args.verbose: + verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( + ) + #verbose += "learning rate: %f" % ( + # outputs["learning_rate"] + # if warmup_steps > 0 else args.learning_rate) + print(verbose) + + current_example, current_epoch = reader.get_train_progress() + time_end = time.time() + used_time = time_end - time_begin + print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " + "f1: %f, precision: %f, recall: %f, speed: %f steps/s" + % (current_epoch, current_example, num_train_examples, + steps, outputs["loss"], outputs["f1"], + outputs["precision"], outputs["recall"], + args.skip_steps / used_time)) + time_begin = time.time() + else: + train_exe.run(fetch_list=[]) + + if nccl2_trainer_id == 0: + if steps % args.save_steps == 0: + save_path = os.path.join(args.checkpoints, + "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + + if steps % args.validation_steps == 0: + # evaluate dev set + if args.do_val: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.dev_set, + batch_size=args.batch_size, + epoch=1, + dev_count=1, + shuffle=False)) + evaluate(exe, test_prog, test_pyreader, graph_vars, + args.num_labels, "dev") + # evaluate test set + if args.do_test: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.test_set, + batch_size=args.batch_size, + epoch=1, + dev_count=1, + shuffle=False)) + evaluate(exe, test_prog, test_pyreader, graph_vars, + args.num_labels, "test") + except fluid.core.EOFException: + save_path = os.path.join(args.checkpoints, "step_" + str(steps)) + fluid.io.save_persistables(exe, save_path, train_program) + train_pyreader.reset() + break + + # final eval on dev set + if args.do_val: + test_pyreader.decorate_tensor_provider( + 
reader.data_generator( + args.dev_set, + batch_size=args.batch_size, + epoch=1, + dev_count=1, + shuffle=False)) + print("Final validation result:") + evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, + "dev") + + # final eval on test set + if args.do_test: + test_pyreader.decorate_tensor_provider( + reader.data_generator( + args.test_set, + batch_size=args.batch_size, + epoch=1, + dev_count=1, + shuffle=False)) + print("Final test result:") + evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, + "test") + + + +if __name__ == '__main__': + print_arguments(args) + main(args) diff --git a/ernie-gram/task_conf b/ernie-gram/task_conf new file mode 100644 index 0000000..d329091 --- /dev/null +++ b/ernie-gram/task_conf @@ -0,0 +1,177 @@ +finetuning_task=$1 +init_model_path=$2 + +finetuning_data_path="./data/"$finetuning_task +CONFIG_PATH=${init_model_path}"/ernie_config.json" +vocab_path=${init_model_path}"/vocab.txt" +init_model=${init_model_path}"/params" +train_set=${finetuning_data_path}/train.tsv +dev_set=${finetuning_data_path}/dev.tsv +test_set=${finetuning_data_path}/test.tsv + + +# task specific config + +if [[ $finetuning_task == "MNLI" ]]; +then + epoch="3" + lr="8e-5,1e-4" + batch_size="16" + warmup=0.1 + weight_decay=0.1 + num_labels=3 + max_seq_len=256 + train_set=${finetuning_data_path}/train.tsv + dev_set=${finetuning_data_path}/m/dev.tsv,${finetuning_data_path}/mm/dev.tsv + test_set=${finetuning_data_path}/m/test.tsv,${finetuning_data_path}/mm/test.tsv + + gpu_card=4 + +elif [[ $finetuning_task == "QNLI" ]];then + epoch="12" + lr="6e-5,8e-5,1e-4" + batch_size="16" + warmup=0.1 + weight_decay=0.01 + gpu_card=4 + +elif [[ $finetuning_task == "QQP" ]];then + epoch="10" + lr="1e-4,1.25e-4,1.5e-4" + batch_size="16" + warmup=0.1 + weight_decay=0.00 + gpu_card=4 + +elif [[ $finetuning_task == "SST-2" ]]; +then + epoch="12" + lr="6e-5,8e-5,1e-4" + batch_size="32" + warmup=0.1 + weight_decay=0.0 + gpu_card=2 + +elif [[ $finetuning_task == "CoLA" ]]; +then + epoch="10,12,15" + lr="3e-5,5e-5,8e-5" + batch_size="16,32" + warmup=0.1 + weight_decay=0.01 + num_labels=2 + metric="matthews_corrcoef" + gpu_card=1 + +elif [[ $finetuning_task == "RTE" ]]; +then + epoch="10,15" + lr="1e-4,1.25e-4,1.5e-4" + batch_size="16,32" + warmup=0.1 + weight_decay=0.1 + gpu_card=1 + +elif [[ $finetuning_task == "MRPC" ]];then + epoch="10,12,15" + lr="1e-4,1.25e-4,1.5e-4" + batch_size="16,32" + warmup=0.1 + weight_decay=0.01 + has_fc="false" + metric="acc_and_f1" + gpu_card=1 + +elif [[ $finetuning_task == "STS-B" ]];then + epoch="10,12,15" + lr="1e-4,1.25e-4,1.5e-4" + batch_size="16,32" + warmup=0.1 + weight_decay=0.1 + num_labels=1 + metric="pearson_and_spearman" + is_regression="true" + is_classify="false" + gpu_card=1 + +elif [[ $finetuning_task == "RACE" ]]; +then + epoch="5" # {all:4, middle:6, high:5} + lr="8e-5,1e-4" # {all:8e-5,1e-4, middle:1e-4,1.25e-4,1.5e-4, high:8e-5,1e-4} + batch_size="4" # {all:4, middle:8, high:4} + level="high" # {all, middle, high} + warmup=0.1 + weight_decay=0.01 # {all:0.01,middle:0.1,high:0.01} + num_labels=4 + for_race="true" + do_test="true" + max_seq_len=512 + train_set=${finetuning_data_path}/train-${level}.tsv + dev_set=${finetuning_data_path}/dev-${level}.tsv + test_set=${finetuning_data_path}/test-${level}.tsv + gpu_card=4 + +elif [[ $finetuning_task == "IMDB" ]];then + epoch="3" + lr="8e-5,1e-4,1.25e-4" + batch_size="8" + warmup=0.1 + weight_decay=0.1 + max_seq_len=512 + num_labels=2 + eval_span="true" + 
train_set=${finetuning_data_path}/train.csv + dev_set=${finetuning_data_path}/test.csv + test_set=${finetuning_data_path}/test.csv + + gpu_card=4 + +elif [[ $finetuning_task == "AG" ]];then + epoch="3" + lr="8e-5,1e-4,1.25e-4,1.5e-4" + batch_size="8" + warmup=0.1 + weight_decay=0.0 + max_seq_len=512 + num_labels=4 + eval_span="true" + train_set=${finetuning_data_path}/train.csv + dev_set=${finetuning_data_path}/test.csv + test_set=${finetuning_data_path}/test.csv + + gpu_card=4 + +elif [[ $finetuning_task == "SQuADv1" ]]; +then + epoch="2" + lr="2.25e-4,2.5e-4,2.75e-4" + batch_size="12" + warmup=0.1 + weight_decay=0.0 + max_seq_len=384 + scripts="run_mrc.py" + train_set=${finetuning_data_path}/train.json + dev_set=${finetuning_data_path}/dev.json + test_set=${finetuning_data_path}/dev.json + + gpu_card=4 + +elif [[ $finetuning_task == "SQuADv2" ]]; +then + epoch="4" + lr="1.25e-4,1.5e-4" + batch_size="12" + warmup=0.1 + weight_decay=0.0 + max_seq_len=384 + scripts="run_mrc.py" + version_2="true" + train_set=${finetuning_data_path}/train-v2.0.json + dev_set=${finetuning_data_path}/dev-v2.0.json + test_set=${finetuning_data_path}/dev-v2.0.json + + gpu_card=4 + +fi + + diff --git a/ernie-gram/utils/__init__.py b/ernie-gram/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ernie-gram/utils/args.py b/ernie-gram/utils/args.py new file mode 100644 index 0000000..88849d2 --- /dev/null +++ b/ernie-gram/utils/args.py @@ -0,0 +1,68 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Arguments for configuration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six +import argparse +import logging + +def prepare_logger(logger, debug=False, save_to_file=None): + formatter = logging.Formatter(fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s') + console_hdl = logging.StreamHandler() + console_hdl.setFormatter(formatter) + logger.addHandler(console_hdl) + if save_to_file is not None and not os.path.exists(save_to_file): + file_hdl = logging.FileHandler(save_to_file) + file_hdl.setFormatter(formatter) + logger.addHandler(file_hdl) + logger.setLevel(logging.DEBUG) + logger.propagate = False + +def str2bool(v): + # because argparse does not support to parse "true, False" as python + # boolean directly + return v.lower() in ("true", "t", "1") + + +class ArgumentGroup(object): + def __init__(self, parser, title, des): + self._group = parser.add_argument_group(title=title, description=des) + + def add_arg(self, name, type, default, help, positional_arg=False, **kwargs): + prefix = "" if positional_arg else "--" + type = str2bool if type == bool else type + self._group.add_argument( + prefix + name, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(six.iteritems(vars(args))): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def inv_arguments(args): + args_list = [] + for arg, value in sorted(six.iteritems(vars(args))): + args_list.extend(['--' + str(arg), str(value)]) + return args_list diff --git a/ernie-gram/utils/cmrc2018_eval.py b/ernie-gram/utils/cmrc2018_eval.py new file mode 100644 index 0000000..b158fd1 --- /dev/null +++ b/ernie-gram/utils/cmrc2018_eval.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +''' +Evaluation script for CMRC 2018 +version: v5 +Note: +v5 formatted output, add usage description +v4 fixed segmentation issues +''' +from __future__ import print_function +from collections import Counter, OrderedDict +import string +import re +import argparse +import json +import sys +reload(sys) +sys.setdefaultencoding('utf8') +import nltk +import pdb + +# split Chinese with English +def mixed_segmentation(in_str, rm_punc=False): + in_str = str(in_str).decode('utf-8').lower().strip() + segs_out = [] + temp_str = "" + sp_char = ['-',':','_','*','^','/','\\','~','`','+','=', + ',','。',':','?','!','“','”',';','’','《','》','……','·','、', + '「','」','(',')','-','~','『','』'] + for char in in_str: + if rm_punc and char in sp_char: + continue + if re.search(ur'[\u4e00-\u9fa5]', char) or char in sp_char: + if temp_str != "": + ss = nltk.word_tokenize(temp_str) + segs_out.extend(ss) + temp_str = "" + segs_out.append(char) + else: + temp_str += char + + #handling last part + if temp_str != "": + ss = nltk.word_tokenize(temp_str) + segs_out.extend(ss) + + return segs_out + + +# remove punctuation +def remove_punctuation(in_str): + in_str = str(in_str).decode('utf-8').lower().strip() + sp_char = ['-',':','_','*','^','/','\\','~','`','+','=', + ',','。',':','?','!','“','”',';','’','《','》','……','·','、', + '「','」','(',')','-','~','『','』'] + out_segs = [] + for char in in_str: + if char in sp_char: + continue + else: + out_segs.append(char) + return ''.join(out_segs) + + +# find longest common string +def find_lcs(s1, s2): + m = [[0 for i in range(len(s2)+1)] for j in range(len(s1)+1)] + 
mmax = 0 + p = 0 + for i in range(len(s1)): + for j in range(len(s2)): + if s1[i] == s2[j]: + m[i+1][j+1] = m[i][j]+1 + if m[i+1][j+1] > mmax: + mmax=m[i+1][j+1] + p=i+1 + return s1[p-mmax:p], mmax + +# +def evaluate(ground_truth_file, prediction_file): + f1 = 0 + em = 0 + total_count = 0 + skip_count = 0 + for instances in ground_truth_file["data"]: + for instance in instances["paragraphs"]: + context_text = instance['context'].strip() + for qas in instance['qas']: + total_count += 1 + query_id = qas['id'].strip() + query_text = qas['question'].strip() + answers = [ans["text"] for ans in qas["answers"]] + + if query_id not in prediction_file: + sys.stderr.write('Unanswered question: {}\n'.format(query_id)) + skip_count += 1 + continue + + prediction = str(prediction_file[query_id]) + f1 += calc_f1_score(answers, prediction) + em += calc_em_score(answers, prediction) + + f1_score = 100.0 * f1 / total_count + em_score = 100.0 * em / total_count + return f1_score, em_score, total_count, skip_count + + +def calc_f1_score(answers, prediction): + f1_scores = [] + for ans in answers: + ans_segs = mixed_segmentation(ans, rm_punc=True) + prediction_segs = mixed_segmentation(prediction, rm_punc=True) + lcs, lcs_len = find_lcs(ans_segs, prediction_segs) + if lcs_len == 0: + f1_scores.append(0) + continue + precision = 1.0*lcs_len/len(prediction_segs) + recall = 1.0*lcs_len/len(ans_segs) + f1 = (2*precision*recall)/(precision+recall) + f1_scores.append(f1) + return max(f1_scores) + + +def calc_em_score(answers, prediction): + em = 0 + for ans in answers: + ans_ = remove_punctuation(ans) + prediction_ = remove_punctuation(prediction) + if ans_ == prediction_: + em = 1 + break + return em + + +def eval_file(dataset_file, prediction_file): + ground_truth_file = json.load(open(dataset_file, 'rb')) + prediction_file = json.load(open(prediction_file, 'rb')) + F1, EM, TOTAL, SKIP = evaluate(ground_truth_file, prediction_file) + AVG = (EM+F1)*0.5 + return EM, F1, AVG, TOTAL + +if __name__ == '__main__': + EM, F1, AVG, TOTAL = eval_file(sys.argv[1], sys.argv[2]) + print(EM) + print(F1) + print(TOTAL) diff --git a/ernie-gram/utils/evaluate_v1.py b/ernie-gram/utils/evaluate_v1.py new file mode 100644 index 0000000..c49deb7 --- /dev/null +++ b/ernie-gram/utils/evaluate_v1.py @@ -0,0 +1,92 @@ +""" Official evaluation script for v1.1 of the SQuAD dataset. 
""" +from __future__ import print_function +from collections import Counter +import string +import re +import argparse +import json +import sys + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return (normalize_answer(prediction) == normalize_answer(ground_truth)) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article['paragraphs']: + for qa in paragraph['qas']: + total += 1 + if qa['id'] not in predictions: + message = 'Unanswered question ' + qa['id'] + \ + ' will receive score 0.' + print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x['text'], qa['answers'])) + prediction = predictions[qa['id']] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths( + f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + + return exact_match, f1 + + +def eval_file(dataset_file_, prediction_file_): + + expected_version = '1.1' + with open(dataset_file_) as dataset_file: + dataset_json = json.load(dataset_file) + if (dataset_json['version'] != expected_version): + print('Evaluation expects v-' + expected_version + + ', but got dataset with v-' + dataset_json['version'], + file=sys.stderr) + dataset = dataset_json['data'] + with open(prediction_file_) as prediction_file: + predictions = json.load(prediction_file) + em, f1 = evaluate(dataset, predictions) + return f1, em + diff --git a/ernie-gram/utils/evaluate_v2.py b/ernie-gram/utils/evaluate_v2.py new file mode 100644 index 0000000..69189cf --- /dev/null +++ b/ernie-gram/utils/evaluate_v2.py @@ -0,0 +1,285 @@ +"""Official evaluation script for SQuAD version 2.0. + +In addition to basic functionality, we also compute additional statistics and +plot precision-recall curves if an additional na_prob.json file is provided. +This file is expected to map question ID's to the model's predicted probability +that a question is unanswerable. 
+""" +import argparse +import collections +import json +import numpy as np +import os +import re +import string +import sys + +OPTS = None + +def parse_args(): + parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') + parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') + parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') + parser.add_argument('--out-file', '-o', metavar='eval.json', + help='Write accuracy metrics to file (default is stdout).') + parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', + help='Model estimates of probability of no answer.') + parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, + help='Predict "" if no-answer probability exceeds this (default = 1.0).') + parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, + help='Save precision-recall curves to directory.') + parser.add_argument('--verbose', '-v', action='store_true') + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + return parser.parse_args() + +def make_qid_to_has_ans(dataset): + qid_to_has_ans = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid_to_has_ans[qa['id']] = bool(qa['answers']) + return qid_to_has_ans + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + def white_space_fix(text): + return ' '.join(text.split()) + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + def lower(text): + return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) + +def get_tokens(s): + if not s: return [] + return normalize_answer(s).split() + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + +def get_raw_scores(dataset, preds): + exact_scores = {} + f1_scores = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid = qa['id'] + gold_answers = [a['text'] for a in qa['answers'] + if normalize_answer(a['text'])] + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [''] + if qid not in preds: + print('Missing prediction for %s' % qid) + continue + a_pred = preds[qid] + # Take max over all gold answers + exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) + f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) + return exact_scores, f1_scores + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + +def 
make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores.values()) / total), + ('f1', 100.0 * sum(f1_scores.values()) / total), + ('total', total), + ]) + else: + total = len(qid_list) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ('total', total), + ]) + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval['%s_%s' % (prefix, k)] = new_eval[k] + +def plot_pr_curve(precisions, recalls, out_image, title): + plt.step(recalls, precisions, color='b', alpha=0.2, where='post') + plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.xlim([0.0, 1.05]) + plt.ylim([0.0, 1.05]) + plt.title(title) + plt.savefig(out_image) + plt.clf() + +def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, + out_image=None, title=None): + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + true_pos = 0.0 + cur_p = 1.0 + cur_r = 0.0 + precisions = [1.0] + recalls = [0.0] + avg_prec = 0.0 + for i, qid in enumerate(qid_list): + if qid_to_has_ans[qid]: + true_pos += scores[qid] + cur_p = true_pos / float(i+1) + cur_r = true_pos / float(num_true_pos) + if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: + # i.e., if we can put a threshold after this point + avg_prec += cur_p * (cur_r - recalls[-1]) + precisions.append(cur_p) + recalls.append(cur_r) + if out_image: + plot_pr_curve(precisions, recalls, out_image, title) + return {'ap': 100.0 * avg_prec} + +def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, + qid_to_has_ans, out_image_dir): + if out_image_dir and not os.path.exists(out_image_dir): + os.makedirs(out_image_dir) + num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) + if num_true_pos == 0: + return + pr_exact = make_precision_recall_eval( + exact_raw, na_probs, num_true_pos, qid_to_has_ans, + out_image=os.path.join(out_image_dir, 'pr_exact.png'), + title='Precision-Recall curve for Exact Match score') + pr_f1 = make_precision_recall_eval( + f1_raw, na_probs, num_true_pos, qid_to_has_ans, + out_image=os.path.join(out_image_dir, 'pr_f1.png'), + title='Precision-Recall curve for F1 score') + oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} + pr_oracle = make_precision_recall_eval( + oracle_scores, na_probs, num_true_pos, qid_to_has_ans, + out_image=os.path.join(out_image_dir, 'pr_oracle.png'), + title='Oracle Precision-Recall curve (binary task of HasAns vs. 
NoAns)') + merge_eval(main_eval, pr_exact, 'pr_exact') + merge_eval(main_eval, pr_f1, 'pr_f1') + merge_eval(main_eval, pr_oracle, 'pr_oracle') + +def histogram_na_prob(na_probs, qid_list, image_dir, name): + if not qid_list: + return + x = [na_probs[k] for k in qid_list] + weights = np.ones_like(x) / float(len(x)) + plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) + plt.xlabel('Model probability of no-answer') + plt.ylabel('Proportion of dataset') + plt.title('Histogram of no-answer probability: %s' % name) + plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) + plt.clf() + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for i, qid in enumerate(qid_list): + if qid not in scores: continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + +def main(data_file, pred_file, na_prob_file, na_prob_thresh=1.0, out_image_dir=None): + with open(data_file) as f: + dataset_json = json.load(f) + dataset = dataset_json['data'] + with open(pred_file) as f: + preds = json.load(f) + if na_prob_file: + with open(na_prob_file) as f: + na_probs = json.load(f) + else: + na_probs = {k: 0.0 for k in preds} + qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = get_raw_scores(dataset, preds) + exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, + na_prob_thresh) + f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, + na_prob_thresh) + out_eval = make_eval_dict(exact_thresh, f1_thresh) + if has_ans_qids: + has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) + merge_eval(out_eval, has_ans_eval, 'HasAns') + if no_ans_qids: + no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) + merge_eval(out_eval, no_ans_eval, 'NoAns') + if na_prob_file: + find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) + if na_prob_file and out_image_dir: + run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, + qid_to_has_ans, out_image_dir) + histogram_na_prob(na_probs, has_ans_qids, out_image_dir, 'hasAns') + histogram_na_prob(na_probs, no_ans_qids, out_image_dir, 'noAns') + ''' + if OPTS.out_file: + with open(OPTS.out_file, 'w') as f: + json.dump(out_eval, f) + else: + print(json.dumps(out_eval, indent=2)) + ''' + return out_eval + + +def eval_file(dataset_file_, prediction_file_, na_prob_file_): + return main(dataset_file_, prediction_file_, na_prob_file_) + + +if __name__ == '__main__': + OPTS = parse_args() + if OPTS.out_image_dir: + import matplotlib + matplotlib.use('Agg') + 
import matplotlib.pyplot as plt + main() + diff --git a/ernie-gram/utils/glue_data_process.sh b/ernie-gram/utils/glue_data_process.sh new file mode 100644 index 0000000..166224c --- /dev/null +++ b/ernie-gram/utils/glue_data_process.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set -ex +R_DIR=`dirname $0`; MY_DIR=`cd $R_DIR;pwd`; + +INPUT=$1 + +if [[ ! -d ./data/ ]];then + mkdir ./data/ + fi + + +### CoLA +mkdir -p ./data/CoLA +cat $INPUT/CoLA/train.tsv | awk -F"\t" '{if(NR==1){print "label\ttext_a"} else {print $2"\t"$4}}' > ./data/CoLA/train.tsv +cat $INPUT/CoLA/dev.tsv | awk -F"\t" '{if(NR==1){print "label\ttext_a"} else {print $2"\t"$4}}' > ./data/CoLA/dev.tsv +cat $INPUT/CoLA/test.tsv | awk -F"\t" '{if(NR==1){print "qid\ttext_a\tlabel"} else {print $0"\t-1"}}' > ./data/CoLA/test.tsv + +### SST-2 +mkdir -p ./data/SST-2 +cat $INPUT/SST-2/train.tsv | awk -F"\t" '{if(NR==1){print "label\ttext_a"} else if(NF==2) {print $2"\t"$1}}' > ./data/SST-2/train.tsv +cat $INPUT/SST-2/dev.tsv | awk -F"\t" '{if(NR==1){print "label\ttext_a"} else if(NF==2) {print $2"\t"$1}}' > ./data/SST-2/dev.tsv +cat $INPUT/SST-2/test.tsv | awk -F"\t" '{if(NR==1){print "qid\ttext_a\tlabel"} else {print $0"\t-1"}}' > ./data/SST-2/test.tsv + +### MRPC +mkdir -p ./data/MRPC +cat $INPUT/MRPC/train.tsv | awk -F"\t" '{if(NR==1){print "text_a\ttext_b\tlabel"} else{print $4"\t"$5"\t"$1}}' > ./data/MRPC/train.tsv +cat $INPUT/MRPC/dev.tsv | awk -F"\t" '{if(NR==1){print "text_a\ttext_b\tlabel"} else{print $4"\t"$5"\t"$1}}' > ./data/MRPC/dev.tsv +cat $INPUT/MRPC/test.tsv | awk -F"\t" '{if(NR==1){print "qid\ttext_a\ttext_b\tlabel"} else{print $1"\t"$4"\t"$5"\t-1"}}' > ./data/MRPC/test.tsv + +### STS-B +mkdir -p ./data/STS-B +cat $INPUT/STS-B/train.tsv | awk -F"\t" '{if(NR==1){print "text_a\ttext_b\tlabel"} else{print $8"\t"$9"\t"$10}}' > ./data/STS-B/train.tsv +cat $INPUT/STS-B/dev.tsv | awk -F"\t" '{if(NR==1){print "text_a\ttext_b\tlabel"} else{print $8"\t"$9"\t"$10}}' > ./data/STS-B/dev.tsv +cat $INPUT/STS-B/test.tsv | awk -F"\t" '{if(NR==1){print "qid\ttext_a\ttext_b\tlabel"} else{print $1"\t"$8"\t"$9"\t-1"}}' > ./data/STS-B/test.tsv + +### QQP +mkdir -p ./data/QQP +cat $INPUT/QQP/train.tsv | awk -F"\t" '{if(NR==1){print "text_a\ttext_b\tlabel"} else if($6!="") {print $4"\t"$5"\t"$6}}' > ./data/QQP/train.tsv +cat $INPUT/QQP/dev.tsv | awk -F"\t" '{if(NR==1){print "text_a\ttext_b\tlabel"} else if($6!="") {print $4"\t"$5"\t"$6}}' > ./data/QQP/dev.tsv +cat $INPUT/QQP/test.tsv | awk -F"\t" '{if(NR==1){print "qid\ttext_a\ttext_b\tlabel"} else {print $0"\t-1"}}' > ./data/QQP/test.tsv + +### MNLI +mkdir -p ./data/MNLI +cat $INPUT/MNLI/train.tsv | python $MY_DIR/mnli.py > ./data/MNLI/train.tsv + +mkdir -p ./data/MNLI/m +cat $INPUT/MNLI/dev_matched.tsv | python $MY_DIR/mnli.py > ./data/MNLI/m/dev.tsv +cat $INPUT/MNLI/test_matched.tsv | python $MY_DIR/mnli.py > ./data/MNLI/m/test.tsv + +mkdir -p ./data/MNLI/mm +cat $INPUT/MNLI/dev_mismatched.tsv | python $MY_DIR/mnli.py > ./data/MNLI/mm/dev.tsv +cat $INPUT/MNLI/test_mismatched.tsv | python $MY_DIR/mnli.py > ./data/MNLI/mm/test.tsv + +### QNLI +mkdir -p ./data/QNLI +cat $INPUT/QNLI/train.tsv | python $MY_DIR/qnli.py > ./data/QNLI/train.tsv +cat $INPUT/QNLI/dev.tsv | python $MY_DIR/qnli.py > ./data/QNLI/dev.tsv +cat $INPUT/QNLI/test.tsv | python $MY_DIR/qnli.py > ./data/QNLI/test.tsv + +### RTE +mkdir -p ./data/RTE +cat $INPUT/RTE/train.tsv | python $MY_DIR/qnli.py > ./data/RTE/train.tsv +cat $INPUT/RTE/dev.tsv | python $MY_DIR/qnli.py > ./data/RTE/dev.tsv +cat $INPUT/RTE/test.tsv | python 
$MY_DIR/qnli.py > ./data/RTE/test.tsv + +### WNLI +mkdir -p ./data/WNLI +cat $INPUT/WNLI/train.tsv | awk -F"\t" '{if(NR==1){print "text_a\ttext_b\tlabel"} else {print $2"\t"$3"\t"$4}}' > ./data/WNLI/train.tsv +cat $INPUT/WNLI/dev.tsv | awk -F"\t" '{if(NR==1){print "text_a\ttext_b\tlabel"} else {print $2"\t"$3"\t"$4}}' > ./data/WNLI/dev.tsv +cat $INPUT/WNLI/test.tsv | awk -F"\t" '{if(NR==1){print "qid\ttext_a\ttext_b\tlabel"} else {print $1"\t"$2"\t"$3"\t-1"}}' > ./data/WNLI/test.tsv + +### Diagnostics +cat $INPUT/diagnostic/diagnostic.tsv | awk -F"\t" '{if(NR==1){print "qid\ttext_a\ttext_b\tlabel"} else {print $0"\t-1"}}' > ./data/MNLI/diagnostic.tsv + + + diff --git a/ernie-gram/utils/init.py b/ernie-gram/utils/init.py new file mode 100644 index 0000000..9be2a65 --- /dev/null +++ b/ernie-gram/utils/init.py @@ -0,0 +1,72 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import six +import ast +import copy +import subprocess + +import numpy as np +import paddle.fluid as fluid + +if six.PY2: + import commands as subprocess + + +def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False): + assert os.path.exists( + init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path + + def existed_persitables(var): + if not fluid.io.is_persistable(var): + return False + return os.path.exists(os.path.join(init_checkpoint_path, var.name)) + + if not use_fp16: + retcode, ret = subprocess.getstatusoutput( + 'rename .master "" ' + init_checkpoint_path + '/*.master' + ) + + fluid.io.load_vars( + exe, + init_checkpoint_path, + main_program=main_program, + predicate=existed_persitables) + print("Load model from {}".format(init_checkpoint_path)) + + +def init_pretraining_params(exe, + pretraining_params_path, + main_program, + use_fp16=False): + assert os.path.exists(pretraining_params_path + ), "[%s] cann't be found." 
% pretraining_params_path + + def existed_params(var): + if not isinstance(var, fluid.framework.Parameter): + return False + return os.path.exists(os.path.join(pretraining_params_path, var.name)) + + + fluid.io.load_vars( + exe, + pretraining_params_path, + main_program=main_program, + predicate=existed_params) + print("Load pretraining parameters from {}.".format( + pretraining_params_path)) + diff --git a/ernie-gram/utils/utils.sh b/ernie-gram/utils/utils.sh new file mode 100644 index 0000000..7b37943 --- /dev/null +++ b/ernie-gram/utils/utils.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -xu +function check_iplist() { + + if [ ${iplist:-} ]; then + #paddle envs + export PADDLE_PSERVER_PORT=9184 + export PADDLE_TRAINER_IPS=${iplist} + #export PADDLE_CURRENT_IP=`/sbin/ip a | grep inet | grep global | awk '{print $2}' | sed 's/\/[0-9][0-9].*$//g'` + export PADDLE_CURRENT_IP=`hostname -i` + + iparray=(${iplist//,/ }) + for i in "${!iparray[@]}"; do + if [ ${iparray[$i]} == ${PADDLE_CURRENT_IP} ]; then + export PADDLE_TRAINER_ID=$i + fi + done + + export TRAINING_ROLE=TRAINER + #export PADDLE_PSERVERS=127.0.0.1 + export PADDLE_INIT_TRAINER_COUNT=${#iparray[@]} + export PADDLE_PORT=${PADDLE_PSERVER_PORT} + export PADDLE_TRAINERS=${PADDLE_TRAINER_IPS} + export POD_IP=${PADDLE_CURRENT_IP} + export PADDLE_TRAINERS_NUM=${PADDLE_INIT_TRAINER_COUNT} + export PADDLE_IS_LOCAL=0 + echo "****************************************************" + + #paddle debug envs + export GLOG_v=0 + export GLOG_logtostderr=1 + + #nccl debug envs + export NCCL_DEBUG=INFO + export NCCL_IB_GID_INDEX=3 + fi +} + -- GitLab
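
---

Notes on the added code (illustrative sketches, not part of the patch):

`run_classifier.py` and `run_mrc.py` read their hyper-parameters from environment variables at the top of `main()` (`GRID_SEARCH_EPOCH`, `GRID_SEARCH_LR`, `GRID_SEARCH_BSZ`, `RANDSEED`), while `task_conf` lists comma-separated candidate values per task; the shipped `run.sh` / `lanch.py` (not included in this hunk) are presumably what expand those lists and launch the jobs. A minimal Python sketch of that convention, assuming a plain single-process invocation:

```python
# Illustrative sketch only -- the real launch logic lives in run.sh / lanch.py,
# which are not shown in this hunk. It demonstrates the convention used by
# run_mrc.py / run_classifier.py, whose main() overrides args.epoch,
# args.learning_rate, args.batch_size and args.random_seed from the
# GRID_SEARCH_* / RANDSEED environment variables.
import itertools
import os
import subprocess

# Candidate lists written as comma-separated strings, exactly as in the
# SQuADv1 entry of task_conf; the values come from the patch.
epochs = "2".split(",")
lrs = "2.25e-4,2.5e-4,2.75e-4".split(",")
batch_sizes = "12".split(",")

for epoch, lr, bsz in itertools.product(epochs, lrs, batch_sizes):
    env = dict(os.environ,
               GRID_SEARCH_EPOCH=epoch,
               GRID_SEARCH_LR=lr,
               GRID_SEARCH_BSZ=bsz,
               RANDSEED="1")
    # In the actual setup the distributed launcher also provides fleet
    # variables such as PADDLE_NODES_NUM; a direct call is shown only to
    # keep the sketch self-contained.
    subprocess.run(["python", "run_mrc.py"], env=env, check=True)
```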
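`utils/cmrc2018_eval.py` scores Chinese MRC with a longest-common-substring overlap instead of SQuAD's bag-of-tokens F1. That file targets Python 2 (`reload(sys)`/`setdefaultencoding`, `ur''` literals), so the dynamic programme behind `find_lcs` is restated below in Python 3 purely for illustration; the example strings are invented:

```python
# Standalone restatement of find_lcs() from utils/cmrc2018_eval.py, shown in
# Python 3 for illustration. It returns the longest common *substring* and its
# length, which calc_f1_score() turns into precision/recall over characters.
def find_lcs(s1, s2):
    m = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    mmax, p = 0, 0
    for i in range(len(s1)):
        for j in range(len(s2)):
            if s1[i] == s2[j]:
                m[i + 1][j + 1] = m[i][j] + 1
                if m[i + 1][j + 1] > mmax:
                    mmax, p = m[i + 1][j + 1], i + 1
    return s1[p - mmax:p], mmax

lcs, lcs_len = find_lcs(list("北京大学"), list("我在北京大学读书"))
print("".join(lcs), lcs_len)  # -> 北京大学 4
```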
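The SQuAD v1.1 helpers in `utils/evaluate_v1.py` normalise answers (lower-casing, stripping punctuation and articles) before computing token-level F1 and exact match, taking the maximum over all gold answers. A small usage sketch; the answer strings are made up, and the import path assumes the script runs from the `ernie-gram/` directory like the other modules in the patch:

```python
# Usage sketch for the metric helpers in utils/evaluate_v1.py; only the
# functions come from the patch, the example strings are invented.
from utils.evaluate_v1 import exact_match_score, f1_score, metric_max_over_ground_truths

gold_answers = ["Denver Broncos", "The Denver Broncos"]
prediction = "the denver broncos"

# normalize_answer() drops case, punctuation and articles, so both metrics
# treat these strings as identical.
em = metric_max_over_ground_truths(exact_match_score, prediction, gold_answers)
f1 = metric_max_over_ground_truths(f1_score, prediction, gold_answers)
print("EM=%d F1=%.3f" % (em, f1))  # EM=1 F1=1.000
```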
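For SQuAD v2.0, `utils/evaluate_v2.py` first scores every question as if it were answerable and then re-scores with the model's no-answer probabilities: whenever `na_prob` exceeds the threshold the prediction is treated as "no answer", which is counted as correct exactly for questions whose gold answer is empty. A toy illustration of `apply_no_ans_threshold`; the qids, scores and probabilities below are invented:

```python
# Toy illustration of the no-answer thresholding in utils/evaluate_v2.py;
# all qids, scores and probabilities are made up.
from utils.evaluate_v2 import apply_no_ans_threshold

scores = {"q_has_ans": 1.0, "q_no_ans": 0.0}        # raw EM (or F1) per question
na_probs = {"q_has_ans": 0.2, "q_no_ans": 0.9}      # predicted no-answer probability
qid_to_has_ans = {"q_has_ans": True, "q_no_ans": False}

new_scores = apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, 0.5)
# q_has_ans: 0.2 <= 0.5, so the original score is kept              -> 1.0
# q_no_ans : 0.9 >  0.5, predicted unanswerable; the gold answer is
#            indeed empty, so the score becomes float(not False)    -> 1.0
print(new_scores)  # {'q_has_ans': 1.0, 'q_no_ans': 1.0}
```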