diff --git a/ERNIE/README.md b/ERNIE/README.md
index a25f8b73686ebf4f423441c2bf86152cb060dc96..9d48676128525c2b236b10185121bd8165eb26d8 100644
--- a/ERNIE/README.md
+++ b/ERNIE/README.md
@@ -1,4 +1,3 @@
-
## Ernie: **E**nhanced **R**epresentation from k**N**owledge **I**nt**E**gration
*Ernie* learns real-world semantic knowledge by modeling the words, entities and entity relations in massive corpora. Whereas *Bert* learns semantic representations from local language co-occurrence, *Ernie* models semantic knowledge directly, which strengthens the model's semantic representation ability.
@@ -14,3 +13,133 @@
In addition, *Ernie* is trained on multi-source Chinese corpora, including encyclopedia articles, news and forum threads.
We validated the model on several public Chinese datasets, where *Ernie* achieves better results than *Bert*.
+
+| Model | XNLI<br>acc<br>dev / test | LCQMC<br>acc<br>dev / test | MSRA-NER<br>f1-score<br>dev / test | ChnSentiCorp<br>acc<br>dev / test | nlpcc-dbqa<br>mrr<br>dev / test | nlpcc-dbqa<br>f1-score<br>dev / test |
+| --- | --- | --- | --- | --- | --- | --- |
+| Bert | 78.1 / 77.2 | 88.8 / 87.0 | 94.0 / 92.6 | 94.6 / 94.3 | 94.7 / 94.6 | 80.7 / 80.8 |
+| Ernie | 79.9 (+1.8) / 78.4 (+1.2) | 89.7 (+0.9) / 87.4 (+0.4) | 95.0 (+1.0) / 93.8 (+1.2) | 95.2 (+0.6) / 95.4 (+1.1) | 95.0 (+0.3) / 95.1 (+0.5) | 82.3 (+1.6) / 82.7 (+1.9) |
+
+#### Datasets
+
+ - **Natural Language Inference** XNLI
+XNLI was built jointly by Facebook and New York University researchers to evaluate multilingual sentence understanding. The task is to classify the relation between two sentences as contradiction, neutral or entailment. [link](https://github.com/facebookresearch/XNLI)
+
+ - **Semantic Matching** LCQMC
+LCQMC is a question matching dataset built by Harbin Institute of Technology and presented at COLING 2018, a top NLP conference. The task is to decide whether two questions have the same meaning. [link](http://aclweb.org/anthology/C18-1166)
+
+ - **Named Entity Recognition** MSRA-NER
+MSRA-NER was released by Microsoft Research Asia. The task is named entity recognition: identifying entities with specific meaning in text, mainly person names, place names and organization names. [link](http://sighan.cs.uchicago.edu/bakeoff2005/)
+
+ - **Sentiment Analysis** ChnSentiCorp
+ChnSentiCorp is a Chinese sentiment analysis dataset. The task is to judge the sentiment of a passage.
+
+ - **Retrieval-based Question Answering** nlpcc-dbqa
+nlpcc-dbqa is a shared task organized by the NLPCC conference in 2016. The task is to select answers that can answer the given question. [link](http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf)
diff --git a/ERNIE/batching.py b/ERNIE/batching.py
new file mode 100644
index 0000000000000000000000000000000000000000..618f66206965df45a2646ffae0d35c7bf83fb4e5
--- /dev/null
+++ b/ERNIE/batching.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+ """
+    Add masks to batch_tokens; return the masked tokens, mask_label and mask_pos.
+    Note: mask_pos is computed with respect to batch_tokens after padding.
+ """
+ max_len = max([len(sent) for sent in batch_tokens])
+ mask_label = []
+ mask_pos = []
+ prob_mask = np.random.rand(total_token_num)
+ # Note: the first token is [CLS], so [low=1]
+ replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+ pre_sent_len = 0
+ prob_index = 0
+ for sent_index, sent in enumerate(batch_tokens):
+ mask_flag = False
+ mask_word = mask_word_tags[sent_index]
+ prob_index += pre_sent_len
+ if mask_word:
+ beg = 0
+ for token_index, token in enumerate(sent):
+ seg_label = seg_labels[sent_index][token_index]
+ if seg_label == 1:
+ continue
+ if beg == 0:
+ if seg_label != -1:
+ beg = token_index
+ continue
+
+ prob = prob_mask[prob_index + beg]
+ if prob > 0.15:
+ pass
+ else:
+                    for index in range(beg, token_index):
+ prob = prob_mask[prob_index + index]
+ base_prob = 1.0
+ if index == beg:
+ base_prob = 0.15
+ if base_prob * 0.2 < prob <= base_prob:
+ mask_label.append(sent[index])
+ sent[index] = MASK
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + index)
+ elif base_prob * 0.1 < prob <= base_prob * 0.2:
+ mask_label.append(sent[index])
+ sent[index] = replace_ids[prob_index + index]
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + index)
+ else:
+ mask_label.append(sent[index])
+ mask_pos.append(sent_index * max_len + index)
+
+ if seg_label == -1:
+ beg = 0
+ else:
+ beg = token_index
+ else:
+ for token_index, token in enumerate(sent):
+ prob = prob_mask[prob_index + token_index]
+ if prob > 0.15:
+ continue
+ elif 0.03 < prob <= 0.15:
+ # mask
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ sent[token_index] = MASK
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + token_index)
+ elif 0.015 < prob <= 0.03:
+ # random replace
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ sent[token_index] = replace_ids[prob_index + token_index]
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + token_index)
+ else:
+ # keep the original token
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ mask_pos.append(sent_index * max_len + token_index)
+
+ pre_sent_len = len(sent)
+
+ mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+ mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+ return batch_tokens, mask_label, mask_pos
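+
+# A minimal smoke test of mask() (hypothetical ids, assuming [CLS]=1, [SEP]=2,
+# [MASK]=3 and a vocab of 100 tokens; seg_labels mark [CLS]/[SEP] with -1 and
+# word-begin positions with 0):
+#
+#   batch = [[1, 40, 50, 60, 2]]
+#   out, mask_label, mask_pos = mask(
+#       batch, seg_labels=[[-1, 0, 0, 0, -1]], mask_word_tags=[False],
+#       total_token_num=5, vocab_size=100)
+#   # mask_label holds the original ids of the ~15% selected positions,
+#   # mask_pos their flat offsets (sent_index * max_len + token_index).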
+
+
+def prepare_batch_data(insts,
+ total_token_num,
+ voc_size=0,
+ pad_id=None,
+ cls_id=None,
+ sep_id=None,
+ mask_id=None,
+ return_attn_bias=True,
+ return_max_len=True,
+ return_num_token=False):
+
+ batch_src_ids = [inst[0] for inst in insts]
+ batch_sent_ids = [inst[1] for inst in insts]
+ batch_pos_ids = [inst[2] for inst in insts]
+ labels = [inst[3] for inst in insts]
+ labels = np.array(labels).astype("int64").reshape([-1, 1])
+ seg_labels = [inst[4] for inst in insts]
+ mask_word_tags = [inst[5] for inst in insts]
+
+ # First step: do mask without padding
+    assert mask_id >= 0, "[FATAL] mask_id must be >= 0"
+ out, mask_label, mask_pos = mask(
+ batch_src_ids,
+ seg_labels,
+ mask_word_tags,
+ total_token_num,
+ vocab_size=voc_size,
+ CLS=cls_id,
+ SEP=sep_id,
+ MASK=mask_id)
+
+ # Second step: padding
+ src_id, next_sent_index, self_attn_bias = pad_batch_data(
+ out, pad_idx=pad_id, return_next_sent_pos=True, return_attn_bias=True)
+ pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
+ sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)
+
+ return_list = [src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels, next_sent_index]
+
+ return return_list
+
+
+def pad_batch_data(insts,
+ pad_idx=0,
+ return_pos=False,
+ return_next_sent_pos=False,
+ return_attn_bias=False,
+ return_max_len=False,
+ return_num_token=False):
+ """
+ Pad the instances to the max sequence length in batch, and generate the
+ corresponding position data and attention bias.
+ """
+ return_list = []
+ max_len = max(len(inst) for inst in insts)
+    # Any token included in the vocab can be used to pad, since the paddings' loss
+    # will be masked out by weights and has no effect on parameter gradients.
+
+ inst_data = np.array(
+ [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
+ return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+
+    # next_sent_pos is used to extract the first-token embedding of each sentence
+ if return_next_sent_pos:
+ batch_size = inst_data.shape[0]
+ max_seq_len = inst_data.shape[1]
+ next_sent_index = np.array(
+ range(0, batch_size * max_seq_len, max_seq_len)).astype(
+ "int64").reshape(-1, 1)
+ return_list += [next_sent_index]
+
+ # position data
+ if return_pos:
+ inst_pos = np.array([
+ list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+ for inst in insts
+ ])
+
+ return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+
+ if return_attn_bias:
+ # This is used to avoid attention on paddings.
+ slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+ (max_len - len(inst)) for inst in insts])
+ slf_attn_bias_data = np.tile(
+ slf_attn_bias_data.reshape([-1, 1, max_len]), [1, max_len, 1])
+ return_list += [slf_attn_bias_data.astype("float32")]
+
+ if return_max_len:
+ return_list += [max_len]
+
+ if return_num_token:
+ num_token = 0
+ for inst in insts:
+ num_token += len(inst)
+ return_list += [num_token]
+
+ return return_list if len(return_list) > 1 else return_list[0]
+
+
+if __name__ == "__main__":
+ pass
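+
+    # A minimal check of pad_batch_data (hypothetical token ids, assuming
+    # pad id 0): two sentences are padded to the batch max length and an
+    # attention bias of 0 / -1e9 marks real / padded positions.
+    insts = [[1, 7, 9, 2], [1, 5, 2]]
+    src, bias = pad_batch_data(insts, pad_idx=0, return_attn_bias=True)
+    print(src.shape)   # (2, 4, 1)
+    print(bias.shape)  # (2, 4, 4)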
diff --git a/ERNIE/finetune/__init__.py b/ERNIE/finetune/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ERNIE/finetune/classifier.py b/ERNIE/finetune/classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1f1f9d217b3a6eb6ed15f7fab6497b32446132
--- /dev/null
+++ b/ERNIE/finetune/classifier.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model for classifier."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import numpy as np
+
+import paddle.fluid as fluid
+
+from model.ernie import ErnieModel
+
+
+def create_model(args,
+ pyreader_name,
+ ernie_config,
+ is_prediction=False):
+ pyreader = fluid.layers.py_reader(
+ capacity=50,
+ shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+ [-1, args.max_seq_len, 1],
+ [-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1], [-1, 1]],
+ dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64'],
+ lod_levels=[0, 0, 0, 0, 0, 0, 0],
+ name=pyreader_name,
+ use_double_buffer=True)
+
+ (src_ids, sent_ids, pos_ids, self_attn_mask, labels,
+ next_sent_index, qids) = fluid.layers.read_file(pyreader)
+
+ ernie = ErnieModel(
+ src_ids=src_ids,
+ position_ids=pos_ids,
+ sentence_ids=sent_ids,
+ self_attn_mask=self_attn_mask,
+ config=ernie_config,
+ use_fp16=args.use_fp16)
+
+ cls_feats = ernie.get_pooled_output(next_sent_index)
+ cls_feats = fluid.layers.dropout(
+ x=cls_feats,
+ dropout_prob=0.1,
+ dropout_implementation="upscale_in_train")
+ logits = fluid.layers.fc(
+ input=cls_feats,
+ size=ernie_config["num_labels"],
+ param_attr=fluid.ParamAttr(
+ name="cls_out_w",
+ initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+ bias_attr=fluid.ParamAttr(
+ name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+
+ if is_prediction:
+ probs = fluid.layers.softmax(logits)
+ feed_targets_name = [
+ src_ids.name, pos_ids.name, sent_ids.name, self_attn_mask.name,
+ next_sent_index.name
+ ]
+ return pyreader, probs, feed_targets_name
+
+ ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+ logits=logits, label=labels, return_softmax=True)
+ loss = fluid.layers.mean(x=ce_loss)
+
+ if args.use_fp16 and args.loss_scaling > 1.0:
+ loss *= args.loss_scaling
+
+ num_seqs = fluid.layers.create_tensor(dtype='int64')
+ accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
+
+ graph_vars = {"loss": loss,
+ "probs": probs,
+ "accuracy": accuracy,
+ "labels": labels,
+ "num_seqs": num_seqs,
+ "qids": qids}
+
+ for k, v in graph_vars.items():
+ v.persistable=True
+
+ return pyreader, graph_vars
+
+def evaluate_mrr(preds):
+ last_qid = None
+ total_mrr = 0.0
+ qnum = 0.0
+ rank = 0.0
+ correct = False
+ for qid, score, label in preds:
+ if qid != last_qid:
+ rank = 0.0
+ qnum += 1
+ correct = False
+ last_qid = qid
+
+ rank += 1
+ if not correct and label != 0:
+ total_mrr += 1.0 / rank
+ correct = True
+
+ return total_mrr / qnum
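+
+# Worked example (hypothetical predictions, sorted by (qid, -score)): for qid 1
+# the first relevant answer sits at rank 2, for qid 2 at rank 1, so
+#   evaluate_mrr([(1, 0.9, 0), (1, 0.8, 1), (2, 0.7, 1)])  # -> (1/2 + 1) / 2 = 0.75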
+
+def evaluate_map(preds):
+    def single_map(st, en):
+ total_p = 0.0
+ correct_num = 0.0
+        for index in range(st, en):
+ if int(preds[index][2]) != 0:
+ correct_num += 1
+ total_p += correct_num / (index - st + 1)
+ if int(correct_num) == 0:
+ return 0.0
+ return total_p / correct_num
+
+ last_qid = None
+ total_map = 0.0
+ qnum = 0.0
+ st = 0
+    for i in range(len(preds)):
+ qid = preds[i][0]
+ if qid != last_qid:
+ qnum += 1
+            if last_qid is not None:
+                total_map += single_map(st, i)
+ st = i
+ last_qid = qid
+
+    total_map += single_map(st, len(preds))
+ return total_map / qnum
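+
+# Worked example (hypothetical predictions): qid 1 has relevant answers at
+# ranks 1 and 3 (AP = (1/1 + 2/3) / 2 ~= 0.833), qid 2 at rank 1 (AP = 1.0),
+# so evaluate_map([(1, .9, 1), (1, .8, 0), (1, .7, 1), (2, .6, 1)]) ~= 0.917.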
+
+def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
+ train_fetch_list = [graph_vars["loss"].name,
+ graph_vars["accuracy"].name,
+ graph_vars["num_seqs"].name
+ ]
+
+ if eval_phase == "train":
+ if "learning_rate" in graph_vars:
+ train_fetch_list.append(graph_vars["learning_rate"].name)
+ outputs = exe.run(fetch_list=train_fetch_list)
+ ret = {"loss":np.mean(outputs[0]), "accuracy":np.mean(outputs[1])}
+ if "learning_rate" in graph_vars:
+ ret["learning_rate"] = float(outputs[4][0])
+ return ret
+
+ test_pyreader.start()
+ total_cost, total_acc, total_num_seqs, total_label_pos_num, total_pred_pos_num, total_correct_num = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+ qids, labels, scores = [], [], []
+ time_begin = time.time()
+
+ fetch_list = [graph_vars["loss"].name,
+ graph_vars["accuracy"].name,
+ graph_vars["probs"].name,
+ graph_vars["labels"].name,
+ graph_vars["num_seqs"].name,
+ graph_vars["qids"].name]
+ while True:
+ try:
+ np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(program=test_program,
+ fetch_list=fetch_list)
+ total_cost += np.sum(np_loss * np_num_seqs)
+ total_acc += np.sum(np_acc * np_num_seqs)
+ total_num_seqs += np.sum(np_num_seqs)
+ labels.extend(np_labels.reshape((-1)).tolist())
+ qids.extend(np_qids.reshape(-1).tolist())
+ scores.extend(np_probs[:,1].reshape(-1).tolist())
+ np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
+ total_label_pos_num += np.sum(np_labels)
+ total_pred_pos_num += np.sum(np_preds)
+ total_correct_num += np.sum(np.dot(np_preds, np_labels))
+ except fluid.core.EOFException:
+ test_pyreader.reset()
+ break
+ time_end = time.time()
+
+ if len(qids) == 0:
+ print("[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s" %
+ (eval_phase, total_cost / total_num_seqs,
+ total_acc / total_num_seqs, total_num_seqs, time_end - time_begin))
+ else:
+ r = total_correct_num / total_label_pos_num
+ p = total_correct_num / total_pred_pos_num
+ f = 2 * p * r / (p + r)
+
+ assert len(qids) == len(labels) == len(scores)
+ preds = sorted(zip(qids, scores, labels), key=lambda elem:(elem[0], -elem[1]))
+ mrr = evaluate_mrr(preds)
+        map_score = evaluate_map(preds)
+
+ print("[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s" %
+ (eval_phase, total_cost / total_num_seqs,
+ total_acc / total_num_seqs,
+ mrr, map, p, r, f, total_num_seqs, time_end - time_begin))
diff --git a/ERNIE/finetune/sequence_label.py b/ERNIE/finetune/sequence_label.py
new file mode 100644
index 0000000000000000000000000000000000000000..327c9e56f0d2ae6f10718a88bce440b79b63dc18
--- /dev/null
+++ b/ERNIE/finetune/sequence_label.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+import numpy as np
+import multiprocessing
+
+import paddle
+import paddle.fluid as fluid
+
+from model.ernie import ErnieModel
+
+def create_model(args,
+ pyreader_name,
+ ernie_config,
+ is_prediction=False):
+ pyreader = fluid.layers.py_reader(
+ capacity=50,
+ shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+ [-1, args.max_seq_len, 1], [-1, args.max_seq_len, args.max_seq_len],
+ [-1, args.max_seq_len, 1], [-1, 1]],
+ dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64'],
+ lod_levels=[0, 0, 0, 0, 0, 0],
+ name=pyreader_name,
+ use_double_buffer=True)
+
+ (src_ids, sent_ids, pos_ids, self_attn_mask, labels,
+ seq_lens) = fluid.layers.read_file(pyreader)
+
+ ernie = ErnieModel(
+ src_ids=src_ids,
+ position_ids=pos_ids,
+ sentence_ids=sent_ids,
+ self_attn_mask=self_attn_mask,
+ config=ernie_config,
+ use_fp16=args.use_fp16)
+
+ enc_out = ernie.get_sequence_output()
+ logits = fluid.layers.fc(
+ input=enc_out,
+ size=args.num_labels,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name="cls_seq_label_out_w",
+ initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+ bias_attr=fluid.ParamAttr(
+ name="cls_seq_label_out_b", initializer=fluid.initializer.Constant(0.)))
+
+ ret_labels = fluid.layers.reshape(x=labels, shape=[-1,1])
+ ret_infers = fluid.layers.reshape(x=fluid.layers.argmax(logits, axis=2), shape=[-1,1])
+
+ labels = fluid.layers.flatten(labels, axis=2)
+ ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+ logits=fluid.layers.flatten(logits, axis=2),
+ label=labels, return_softmax=True)
+ loss = fluid.layers.mean(x=ce_loss)
+
+ if args.use_fp16 and args.loss_scaling > 1.0:
+ loss *= args.loss_scaling
+
+ graph_vars = {"loss": loss,
+ "probs": probs,
+ "labels": ret_labels,
+ "infers": ret_infers,
+ "seq_lens": seq_lens}
+
+ for k, v in graph_vars.items():
+ v.persistable=True
+
+ return pyreader, graph_vars
+
+def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
+
+ def extract_bio_chunk(seq):
+ chunks = []
+ cur_chunk = None
+ null_index = tag_num - 1
+        for index in range(len(seq)):
+ tag = seq[index]
+ tag_type = tag // 2
+ tag_pos = tag % 2
+
+ if tag == null_index:
+ if cur_chunk is not None:
+ chunks.append(cur_chunk)
+ cur_chunk = None
+ continue
+
+            if tag_pos == 0:
+                if cur_chunk is not None:
+                    chunks.append(cur_chunk)
+                cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
+
+ else:
+ if cur_chunk is None:
+ cur_chunk = {"st":index, "en": index + 1, "type": tag_type}
+ continue
+
+ if cur_chunk["type"] == tag_type:
+ cur_chunk["en"] = index + 1
+ else:
+ chunks.append(cur_chunk)
+ cur_chunk = {"st":index, "en": index + 1, "type": tag_type}
+
+ if cur_chunk is not None:
+ chunks.append(cur_chunk)
+ return chunks
+
+ null_index = tag_num - 1
+ num_label = 0
+ num_infer = 0
+ num_correct = 0
+ labels = np_labels.reshape([-1]).astype(np.int32).tolist()
+ infers = np_infers.reshape([-1]).astype(np.int32).tolist()
+ all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()
+
+ base_index = 0
+    for dev_index in range(dev_count):
+ lens = all_lens[dev_index]
+ max_len = 0
+ for l in lens:
+ max_len = max(max_len, l)
+
+        for i in range(len(lens)):
+ seq_st = base_index + i * max_len + 1
+ seq_en = seq_st + (lens[i] - 2)
+ infer_chunks = extract_bio_chunk(infers[seq_st:seq_en])
+ label_chunks = extract_bio_chunk(labels[seq_st:seq_en])
+ num_infer += len(infer_chunks)
+ num_label += len(label_chunks)
+
+ infer_index = 0
+ label_index = 0
+ while label_index < len(label_chunks) and infer_index < len(infer_chunks):
+ if infer_chunks[infer_index]["st"] < label_chunks[label_index]["st"]:
+ infer_index += 1
+ elif infer_chunks[infer_index]["st"] > label_chunks[label_index]["st"]:
+ label_index += 1
+ else:
+ if infer_chunks[infer_index]["en"] == label_chunks[label_index]["en"] and \
+ infer_chunks[infer_index]["type"] == label_chunks[label_index]["type"]:
+ num_correct += 1
+
+ infer_index += 1
+ label_index += 1
+
+ base_index += max_len * len(lens)
+
+ return num_label, num_infer, num_correct
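+
+# Tag layout assumed by the inner extract_bio_chunk helper: tags are packed as
+# B-type = 2 * type, I-type = 2 * type + 1, and O = tag_num - 1. For a
+# hypothetical 2-type scheme (tag_num = 5: 0=B-PER, 1=I-PER, 2=B-LOC, 3=I-LOC,
+# 4=O), the sequence [0, 1, 4, 2, 3] yields the chunks
+# [{"st": 0, "en": 2, "type": 0}, {"st": 3, "en": 5, "type": 1}].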
+
+def calculate_f1(num_label, num_infer, num_correct):
+ if num_infer == 0:
+ precision = 0.0
+ else:
+ precision = num_correct * 1.0 / num_infer
+
+ if num_label == 0:
+ recall = 0.0
+ else:
+ recall = num_correct * 1.0 / num_label
+
+ if num_correct == 0:
+ f1 = 0.0
+ else:
+ f1 = 2 * precision * recall / (precision + recall)
+ return precision, recall, f1
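+
+# Worked example (hypothetical counts): with 10 gold chunks, 8 predicted chunks
+# and 6 exact matches, calculate_f1(10, 8, 6) gives precision 6/8 = 0.75,
+# recall 6/10 = 0.6 and f1 = 2 * 0.75 * 0.6 / (0.75 + 0.6) ~= 0.667.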
+
+def evaluate(exe, program, pyreader, graph_vars, tag_num, eval_phase, dev_count=1):
+ fetch_list = [graph_vars["labels"].name,
+ graph_vars["infers"].name,
+ graph_vars["seq_lens"].name]
+
+ if eval_phase == "train":
+ fetch_list.append(graph_vars["loss"].name)
+ if "learning_rate" in graph_vars:
+ fetch_list.append(graph_vars["learning_rate"].name)
+ outputs = exe.run(fetch_list=fetch_list)
+ np_labels, np_infers, np_lens, np_loss = outputs[:4]
+ num_label, num_infer, num_correct = chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count)
+ precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
+ outputs = {"precision": precision, "recall": recall, "f1": f1, "loss": np.mean(np_loss)}
+ if "learning_rate" in graph_vars:
+ outputs["lr"] = float(outputs[4][0])
+ return outputs
+
+ else:
+ total_label, total_infer, total_correct = 0.0, 0.0, 0.0
+ time_begin = time.time()
+ pyreader.start()
+ while True:
+ try:
+ np_labels, np_infers, np_lens = exe.run(program=program, fetch_list=fetch_list)
+ label_num, infer_num, correct_num = chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count)
+ total_infer += infer_num
+ total_label += label_num
+ total_correct += correct_num
+
+ except fluid.core.EOFException:
+ pyreader.reset()
+ break
+
+ precision, recall, f1 = calculate_f1(total_label, total_infer, total_correct)
+ time_end = time.time()
+
+ print("[%s evaluation] f1: %f, precision: %f, recall: %f, elapsed time: %f s" %
+ (eval_phase, f1, precision, recall, time_end - time_begin))
diff --git a/ERNIE/finetune_args.py b/ERNIE/finetune_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f7d9349b8087f57addfbdf9cf8b1bf48156d65e
--- /dev/null
+++ b/ERNIE/finetune_args.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+
+from utils.args import ArgumentGroup
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
+model_g.add_arg("ernie_config_path", str, None, "Path to the json file for ernie model config.")
+model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
+model_g.add_arg("init_pretraining_params", str, None,
+ "Init pre-training params which preforms fine-tuning from. If the "
+ "arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
+model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
+
+train_g = ArgumentGroup(parser, "training", "training options.")
+train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
+train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
+train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
+ "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
+train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
+train_g.add_arg("warmup_proportion", float, 0.1,
+ "Proportion of training steps to perform linear learning rate warmup for.")
+train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
+train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
+train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
+train_g.add_arg("loss_scaling", float, 1.0,
+ "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
+
+log_g = ArgumentGroup(parser, "logging", "logging related.")
+log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
+log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
+
+data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
+data_g.add_arg("train_set", str, None, "Path to training data.")
+data_g.add_arg("test_set", str, None, "Path to test data.")
+data_g.add_arg("dev_set", str, None, "Path to validation data.")
+data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
+data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
+data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("in_tokens", bool, False,
+ "If set, the batch size will be the maximum number of tokens in one batch. "
+ "Otherwise, it will be the maximum number of examples in one batch.")
+data_g.add_arg("do_lower_case", bool, True,
+ "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
+data_g.add_arg("random_seed", int, 0, "Random seed.")
+data_g.add_arg("label_map_config", str, None, "label_map_path.")
+data_g.add_arg("num_labels", int, 2, "label number")
+
+run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
+run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
+run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.")
+run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
+run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
+run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
+run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
diff --git a/ERNIE/model/__init__.py b/ERNIE/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ERNIE/model/ernie.py b/ERNIE/model/ernie.py
new file mode 100644
index 0000000000000000000000000000000000000000..69d6e112eb97855e933800b57d5eeb560caf2257
--- /dev/null
+++ b/ERNIE/model/ernie.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Ernie model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import json
+import numpy as np
+import paddle.fluid as fluid
+from model.transformer_encoder import encoder, pre_process_layer
+
+
+class ErnieConfig(object):
+ def __init__(self, config_path):
+ self._config_dict = self._parse(config_path)
+
+ def _parse(self, config_path):
+ try:
+ with open(config_path) as json_file:
+ config_dict = json.load(json_file)
+ except Exception:
+ raise IOError("Error in parsing Ernie model config file '%s'" %
+ config_path)
+ else:
+ return config_dict
+
+ def __getitem__(self, key):
+ return self._config_dict[key]
+
+ def print_config(self):
+ for arg, value in sorted(six.iteritems(self._config_dict)):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+class ErnieModel(object):
+ def __init__(self,
+ src_ids,
+ position_ids,
+ sentence_ids,
+ self_attn_mask,
+ config,
+ weight_sharing=True,
+ use_fp16=False):
+
+ self._emb_size = config['hidden_size']
+ self._n_layer = config['num_hidden_layers']
+ self._n_head = config['num_attention_heads']
+ self._voc_size = config['vocab_size']
+ self._max_position_seq_len = config['max_position_embeddings']
+ self._sent_types = config['type_vocab_size']
+ self._hidden_act = config['hidden_act']
+ self._prepostprocess_dropout = config['hidden_dropout_prob']
+ self._attention_dropout = config['attention_probs_dropout_prob']
+ self._weight_sharing = weight_sharing
+
+ self._word_emb_name = "word_embedding"
+ self._pos_emb_name = "pos_embedding"
+ self._sent_emb_name = "sent_embedding"
+ self._dtype = "float16" if use_fp16 else "float32"
+
+        # Initialize all weights with a truncated normal initializer; all biases
+        # are initialized to zero by the default constant initializer.
+ self._param_initializer = fluid.initializer.TruncatedNormal(
+ scale=config['initializer_range'])
+
+ self._build_model(src_ids, position_ids, sentence_ids, self_attn_mask)
+
+ def _build_model(self, src_ids, position_ids, sentence_ids, self_attn_mask):
+ # padding id in vocabulary must be set to 0
+ emb_out = fluid.layers.embedding(
+ input=src_ids,
+ size=[self._voc_size, self._emb_size],
+ dtype=self._dtype,
+ param_attr=fluid.ParamAttr(
+ name=self._word_emb_name, initializer=self._param_initializer),
+ is_sparse=False)
+ position_emb_out = fluid.layers.embedding(
+ input=position_ids,
+ size=[self._max_position_seq_len, self._emb_size],
+ dtype=self._dtype,
+ param_attr=fluid.ParamAttr(
+ name=self._pos_emb_name, initializer=self._param_initializer))
+
+ sent_emb_out = fluid.layers.embedding(
+ sentence_ids,
+ size=[self._sent_types, self._emb_size],
+ dtype=self._dtype,
+ param_attr=fluid.ParamAttr(
+ name=self._sent_emb_name, initializer=self._param_initializer))
+
+ emb_out = emb_out + position_emb_out
+ emb_out = emb_out + sent_emb_out
+
+ emb_out = pre_process_layer(
+ emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
+
+        if self._dtype == "float16":
+ self_attn_mask = fluid.layers.cast(
+ x=self_attn_mask, dtype=self._dtype)
+
+ n_head_self_attn_mask = fluid.layers.stack(
+ x=[self_attn_mask] * self._n_head, axis=1)
+ n_head_self_attn_mask.stop_gradient = True
+
+ self._enc_out = encoder(
+ enc_input=emb_out,
+ attn_bias=n_head_self_attn_mask,
+ n_layer=self._n_layer,
+ n_head=self._n_head,
+ d_key=self._emb_size // self._n_head,
+ d_value=self._emb_size // self._n_head,
+ d_model=self._emb_size,
+ d_inner_hid=self._emb_size * 4,
+ prepostprocess_dropout=self._prepostprocess_dropout,
+ attention_dropout=self._attention_dropout,
+ relu_dropout=0,
+ hidden_act=self._hidden_act,
+ preprocess_cmd="",
+ postprocess_cmd="dan",
+ param_initializer=self._param_initializer,
+ name='encoder')
+
+ def get_sequence_output(self):
+ return self._enc_out
+
+ def get_pooled_output(self, next_sent_index):
+ """Get the first feature of each sequence for classification"""
+ self._reshaped_emb_out = fluid.layers.reshape(
+ x=self._enc_out, shape=[-1, self._emb_size], inplace=True)
+ next_sent_index = fluid.layers.cast(x=next_sent_index, dtype='int32')
+ next_sent_feat = fluid.layers.gather(
+ input=self._reshaped_emb_out, index=next_sent_index)
+ next_sent_feat = fluid.layers.fc(
+ input=next_sent_feat,
+ size=self._emb_size,
+ act="tanh",
+ param_attr=fluid.ParamAttr(
+ name="pooled_fc.w_0", initializer=self._param_initializer),
+ bias_attr="pooled_fc.b_0")
+ return next_sent_feat
+
+ def get_pretraining_output(self, mask_label, mask_pos, labels,
+ next_sent_index):
+ """Get the loss & accuracy for pretraining"""
+
+ mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
+
+ # extract the first token feature in each sentence
+ next_sent_feat = self.get_pooled_output(next_sent_index)
+ # extract masked tokens' feature
+ mask_feat = fluid.layers.gather(
+ input=self._reshaped_emb_out, index=mask_pos)
+
+ # transform: fc
+ mask_trans_feat = fluid.layers.fc(
+ input=mask_feat,
+ size=self._emb_size,
+ act=self._hidden_act,
+ param_attr=fluid.ParamAttr(
+ name='mask_lm_trans_fc.w_0',
+ initializer=self._param_initializer),
+ bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
+ # transform: layer norm
+ mask_trans_feat = pre_process_layer(
+ mask_trans_feat, 'n', name='mask_lm_trans')
+
+ mask_lm_out_bias_attr = fluid.ParamAttr(
+ name="mask_lm_out_fc.b_0",
+ initializer=fluid.initializer.Constant(value=0.0))
+ if self._weight_sharing:
+ fc_out = fluid.layers.matmul(
+ x=mask_trans_feat,
+ y=fluid.default_main_program().global_block().var(
+ self._word_emb_name),
+ transpose_y=True)
+ fc_out += fluid.layers.create_parameter(
+ shape=[self._voc_size],
+ dtype=self._dtype,
+ attr=mask_lm_out_bias_attr,
+ is_bias=True)
+
+ else:
+ fc_out = fluid.layers.fc(input=mask_trans_feat,
+ size=self._voc_size,
+ param_attr=fluid.ParamAttr(
+ name="mask_lm_out_fc.w_0",
+ initializer=self._param_initializer),
+ bias_attr=mask_lm_out_bias_attr)
+
+ mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
+ logits=fc_out, label=mask_label)
+ mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
+
+ next_sent_fc_out = fluid.layers.fc(
+ input=next_sent_feat,
+ size=2,
+ param_attr=fluid.ParamAttr(
+ name="next_sent_fc.w_0", initializer=self._param_initializer),
+ bias_attr="next_sent_fc.b_0")
+
+ next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
+ logits=next_sent_fc_out, label=labels, return_softmax=True)
+
+ next_sent_acc = fluid.layers.accuracy(
+ input=next_sent_softmax, label=labels)
+
+ mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
+
+ loss = mean_next_sent_loss + mean_mask_lm_loss
+ return next_sent_acc, mean_mask_lm_loss, loss
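+
+# A minimal construction sketch (variable names are illustrative; the real
+# inputs come from a py_reader as in finetune/classifier.py):
+#
+#   config = ErnieConfig("./config/ernie_config.json")
+#   ernie = ErnieModel(src_ids, pos_ids, sent_ids, self_attn_mask, config)
+#   seq_feats = ernie.get_sequence_output()                # [batch, seq_len, hidden]
+#   cls_feats = ernie.get_pooled_output(next_sent_index)   # [batch, hidden]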
diff --git a/ERNIE/model/transformer_encoder.py b/ERNIE/model/transformer_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a77ebe480f0e4a8e2b4f2c0c18b23383075fb7
--- /dev/null
+++ b/ERNIE/model/transformer_encoder.py
@@ -0,0 +1,342 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformer encoder."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import partial
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+
+def multi_head_attention(queries,
+ keys,
+ values,
+ attn_bias,
+ d_key,
+ d_value,
+ d_model,
+ n_head=1,
+ dropout_rate=0.,
+ cache=None,
+ param_initializer=None,
+ name='multi_head_att'):
+ """
+    Multi-Head Attention. Note that attn_bias is added to the logits before
+    computing the softmax activation, so that masked (selected) positions are
+    not considered in the attention weights.
+ """
+ keys = queries if keys is None else keys
+ values = keys if values is None else values
+
+ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+ raise ValueError(
+ "Inputs: quries, keys and values should all be 3-D tensors.")
+
+ def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
+ """
+ Add linear projection to queries, keys, and values.
+ """
+ q = layers.fc(input=queries,
+ size=d_key * n_head,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_query_fc.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_query_fc.b_0')
+ k = layers.fc(input=keys,
+ size=d_key * n_head,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_key_fc.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_key_fc.b_0')
+ v = layers.fc(input=values,
+ size=d_value * n_head,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_value_fc.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_value_fc.b_0')
+ return q, k, v
+
+ def __split_heads(x, n_head):
+ """
+        Reshape the last dimension of input tensor x so that it becomes two
+ dimensions and then transpose. Specifically, input a tensor with shape
+ [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
+ with shape [bs, n_head, max_sequence_length, hidden_dim].
+ """
+ hidden_size = x.shape[-1]
+ # The value 0 in shape attr means copying the corresponding dimension
+ # size of the input as the output dimension size.
+ reshaped = layers.reshape(
+ x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
+
+        # permute the dimensions into:
+ # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+ return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+ def __combine_heads(x):
+ """
+        Transpose and then reshape the last two dimensions of input tensor x
+ so that it becomes one dimension, which is reverse to __split_heads.
+ """
+ if len(x.shape) == 3: return x
+ if len(x.shape) != 4:
+ raise ValueError("Input(x) should be a 4-D Tensor.")
+
+ trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+ # The value 0 in shape attr means copying the corresponding dimension
+ # size of the input as the output dimension size.
+ return layers.reshape(
+ x=trans_x,
+ shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+ inplace=True)
+
+ def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
+ """
+ Scaled Dot-Product Attention
+ """
+ scaled_q = layers.scale(x=q, scale=d_key**-0.5)
+ product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+ if attn_bias:
+ product += attn_bias
+ weights = layers.softmax(product)
+ if dropout_rate:
+ weights = layers.dropout(
+ weights,
+ dropout_prob=dropout_rate,
+ dropout_implementation="upscale_in_train",
+ is_test=False)
+ out = layers.matmul(weights, v)
+ return out
+
+ q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+
+ if cache is not None: # use cache and concat time steps
+ # Since the inplace reshape in __split_heads changes the shape of k and
+ # v, which is the cache input for next time step, reshape the cache
+ # input from the previous time step first.
+ k = cache["k"] = layers.concat(
+ [layers.reshape(
+ cache["k"], shape=[0, 0, d_model]), k], axis=1)
+ v = cache["v"] = layers.concat(
+ [layers.reshape(
+ cache["v"], shape=[0, 0, d_model]), v], axis=1)
+
+ q = __split_heads(q, n_head)
+ k = __split_heads(k, n_head)
+ v = __split_heads(v, n_head)
+
+ ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
+ dropout_rate)
+
+ out = __combine_heads(ctx_multiheads)
+
+ # Project back to the model size.
+ proj_out = layers.fc(input=out,
+ size=d_model,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_output_fc.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_output_fc.b_0')
+ return proj_out
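+
+# Shape note: queries/keys/values are [batch, seq_len, d_model] and attn_bias,
+# when given, must broadcast against the [batch, n_head, seq_len, seq_len]
+# attention logits (the encoder passes a stacked [batch, n_head, len, len] bias).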
+
+
+def positionwise_feed_forward(x,
+ d_inner_hid,
+ d_hid,
+ dropout_rate,
+ hidden_act,
+ param_initializer=None,
+ name='ffn'):
+ """
+ Position-wise Feed-Forward Networks.
+ This module consists of two linear transformations with a ReLU activation
+ in between, which is applied to each position separately and identically.
+ """
+ hidden = layers.fc(input=x,
+ size=d_inner_hid,
+ num_flatten_dims=2,
+ act=hidden_act,
+ param_attr=fluid.ParamAttr(
+ name=name + '_fc_0.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_fc_0.b_0')
+ if dropout_rate:
+ hidden = layers.dropout(
+ hidden,
+ dropout_prob=dropout_rate,
+ dropout_implementation="upscale_in_train",
+ is_test=False)
+ out = layers.fc(input=hidden,
+ size=d_hid,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_fc_1.w_0', initializer=param_initializer),
+ bias_attr=name + '_fc_1.b_0')
+ return out
+
+
+def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
+ name=''):
+ """
+    Add residual connection, layer normalization and dropout to the out tensor
+ optionally according to the value of process_cmd.
+ This will be used before or after multi-head attention and position-wise
+ feed-forward networks.
+ """
+ for cmd in process_cmd:
+ if cmd == "a": # add residual connection
+ out = out + prev_out if prev_out else out
+ elif cmd == "n": # add layer normalization
+ out_dtype = out.dtype
+ if out_dtype == fluid.core.VarDesc.VarType.FP16:
+ out = layers.cast(x=out, dtype="float32")
+ out = layers.layer_norm(
+ out,
+ begin_norm_axis=len(out.shape) - 1,
+ param_attr=fluid.ParamAttr(
+ name=name + '_layer_norm_scale',
+ initializer=fluid.initializer.Constant(1.)),
+ bias_attr=fluid.ParamAttr(
+ name=name + '_layer_norm_bias',
+ initializer=fluid.initializer.Constant(0.)))
+ if out_dtype == fluid.core.VarDesc.VarType.FP16:
+ out = layers.cast(x=out, dtype="float16")
+ elif cmd == "d": # add dropout
+ if dropout_rate:
+ out = layers.dropout(
+ out,
+ dropout_prob=dropout_rate,
+ dropout_implementation="upscale_in_train",
+ is_test=False)
+ return out
+
+
+pre_process_layer = partial(pre_post_process_layer, None)
+post_process_layer = pre_post_process_layer
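+
+# The process_cmd string is executed one letter at a time: "a" adds the
+# residual, "n" applies layer normalization, "d" applies dropout. For example,
+# model/ernie.py pre-processes the embeddings with pre_process_layer(emb, 'nd',
+# ...) (norm then dropout) and post-processes each encoder sub-layer with
+# "dan" (dropout, add, norm).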
+
+
+def encoder_layer(enc_input,
+ attn_bias,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ hidden_act,
+ preprocess_cmd="n",
+ postprocess_cmd="da",
+ param_initializer=None,
+ name=''):
+ """The encoder layers that can be stacked to form a deep encoder.
+    This module consists of a multi-head (self) attention sub-layer followed by
+    a position-wise feed-forward network, both wrapped by post_process_layer
+    to add residual connection, layer normalization and dropout.
+ """
+ attn_output = multi_head_attention(
+ pre_process_layer(
+ enc_input,
+ preprocess_cmd,
+ prepostprocess_dropout,
+ name=name + '_pre_att'),
+ None,
+ None,
+ attn_bias,
+ d_key,
+ d_value,
+ d_model,
+ n_head,
+ attention_dropout,
+ param_initializer=param_initializer,
+ name=name + '_multi_head_att')
+ attn_output = post_process_layer(
+ enc_input,
+ attn_output,
+ postprocess_cmd,
+ prepostprocess_dropout,
+ name=name + '_post_att')
+ ffd_output = positionwise_feed_forward(
+ pre_process_layer(
+ attn_output,
+ preprocess_cmd,
+ prepostprocess_dropout,
+ name=name + '_pre_ffn'),
+ d_inner_hid,
+ d_model,
+ relu_dropout,
+ hidden_act,
+ param_initializer=param_initializer,
+ name=name + '_ffn')
+ return post_process_layer(
+ attn_output,
+ ffd_output,
+ postprocess_cmd,
+ prepostprocess_dropout,
+ name=name + '_post_ffn')
+
+
+def encoder(enc_input,
+ attn_bias,
+ n_layer,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ hidden_act,
+ preprocess_cmd="n",
+ postprocess_cmd="da",
+ param_initializer=None,
+ name=''):
+ """
+ The encoder is composed of a stack of identical layers returned by calling
+ encoder_layer.
+ """
+ for i in range(n_layer):
+ enc_output = encoder_layer(
+ enc_input,
+ attn_bias,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ hidden_act,
+ preprocess_cmd,
+ postprocess_cmd,
+ param_initializer=param_initializer,
+ name=name + '_layer_' + str(i))
+ enc_input = enc_output
+ enc_output = pre_process_layer(
+ enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
+
+ return enc_output
diff --git a/ERNIE/optimization.py b/ERNIE/optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..e010bca9cdd57dd90e34961f2b1102888627d27a
--- /dev/null
+++ b/ERNIE/optimization.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimization and learning rate scheduling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+from utils.fp16 import create_master_params_grads, master_param_to_train_param
+
+
+def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
+ """ Applies linear warmup of learning rate from 0 and decay to 0."""
+ with fluid.default_main_program()._lr_schedule_guard():
+ lr = fluid.layers.tensor.create_global_var(
+ shape=[1],
+ value=0.0,
+ dtype='float32',
+ persistable=True,
+ name="scheduled_learning_rate")
+
+ global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
+
+ with fluid.layers.control_flow.Switch() as switch:
+ with switch.case(global_step < warmup_steps):
+ warmup_lr = learning_rate * (global_step / warmup_steps)
+ fluid.layers.tensor.assign(warmup_lr, lr)
+ with switch.default():
+ decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
+ learning_rate=learning_rate,
+ decay_steps=num_train_steps,
+ end_learning_rate=0.0,
+ power=1.0,
+ cycle=False)
+ fluid.layers.tensor.assign(decayed_lr, lr)
+
+ return lr
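+
+# Schedule shape in plain Python (hypothetical numbers; note the decay branch
+# counts global steps from 0, not from the end of warmup):
+#
+#   def lr_at(step, base_lr=1e-4, warmup=100, total=1000):
+#       if step < warmup:
+#           return base_lr * step / warmup                     # linear warmup
+#       return base_lr * max(0.0, 1.0 - float(step) / total)   # linear decay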
+
+
+def optimization(loss,
+ warmup_steps,
+ num_train_steps,
+ learning_rate,
+ train_program,
+ startup_prog,
+ weight_decay,
+ scheduler='linear_warmup_decay',
+ use_fp16=False,
+ loss_scaling=1.0):
+ if warmup_steps > 0:
+ if scheduler == 'noam_decay':
+ scheduled_lr = fluid.layers.learning_rate_scheduler\
+ .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
+ warmup_steps)
+ elif scheduler == 'linear_warmup_decay':
+ scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
+ num_train_steps)
+ else:
+ raise ValueError("Unkown learning rate scheduler, should be "
+ "'noam_decay' or 'linear_warmup_decay'")
+ optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
+ else:
+ optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
+ scheduled_lr = learning_rate
+
+ clip_norm_thres = 1.0
+ # When using mixed precision training, scale the gradient clip threshold
+ # by loss_scaling
+ if use_fp16 and loss_scaling > 1.0:
+ clip_norm_thres *= loss_scaling
+ fluid.clip.set_gradient_clip(
+ clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
+
+ def exclude_from_weight_decay(name):
+ if name.find("layer_norm") > -1:
+ return True
+ bias_suffix = ["_bias", "_b", ".b_0"]
+ for suffix in bias_suffix:
+ if name.endswith(suffix):
+ return True
+ return False
+
+ param_list = dict()
+
+ if use_fp16:
+ param_grads = optimizer.backward(loss)
+ master_param_grads = create_master_params_grads(
+ param_grads, train_program, startup_prog, loss_scaling)
+
+ for param, _ in master_param_grads:
+ param_list[param.name] = param * 1.0
+ param_list[param.name].stop_gradient = True
+
+ optimizer.apply_gradients(master_param_grads)
+
+ if weight_decay > 0:
+ for param, grad in master_param_grads:
+                # str.rstrip strips a trailing character set, not a suffix, so
+                # remove the exact ".master" suffix instead
+                name = param.name[:-len(".master")] if param.name.endswith(".master") else param.name
+                if exclude_from_weight_decay(name):
+                    continue
+ with param.block.program._optimized_guard(
+ [param, grad]), fluid.framework.name_scope("weight_decay"):
+ updated_param = param - param_list[
+ param.name] * weight_decay * scheduled_lr
+ fluid.layers.assign(output=param, input=updated_param)
+
+ master_param_to_train_param(master_param_grads, param_grads,
+ train_program)
+
+ else:
+ for param in train_program.global_block().all_parameters():
+ param_list[param.name] = param * 1.0
+ param_list[param.name].stop_gradient = True
+
+ _, param_grads = optimizer.minimize(loss)
+
+ if weight_decay > 0:
+ for param, grad in param_grads:
+ if exclude_from_weight_decay(param.name):
+ continue
+ with param.block.program._optimized_guard(
+ [param, grad]), fluid.framework.name_scope("weight_decay"):
+ updated_param = param - param_list[
+ param.name] * weight_decay * scheduled_lr
+ fluid.layers.assign(output=param, input=updated_param)
+
+ return scheduled_lr
diff --git a/ERNIE/pretrain_args.py b/ERNIE/pretrain_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..2543a346ff7cab114a9418978c2e5dfc5b018a5a
--- /dev/null
+++ b/ERNIE/pretrain_args.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+
+from utils.args import ArgumentGroup, print_arguments
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
+model_g.add_arg("ernie_config_path", str, "./config/ernie_config.json", "Path to the json file for ernie model config.")
+model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
+model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
+model_g.add_arg("weight_sharing", bool, True, "If set, share weights between word embedding and masked lm.")
+model_g.add_arg("generate_neg_sample", bool, False, "If set, randomly generate negtive samples by positive samples.")
+
+train_g = ArgumentGroup(parser, "training", "training options.")
+train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
+train_g.add_arg("learning_rate", float, 0.0001, "Learning rate used to train with warmup.")
+train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
+ "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
+train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
+train_g.add_arg("num_train_steps", int, 1000000, "Total steps to perform pretraining.")
+train_g.add_arg("warmup_steps", int, 4000, "Total steps to perform warmup when pretraining.")
+train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
+train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
+train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
+train_g.add_arg("loss_scaling", float, 1.0,
+ "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
+
+log_g = ArgumentGroup(parser, "logging", "logging related.")
+log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
+log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
+
+data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
+data_g.add_arg("train_filelist", str, "", "Path to training filelist.")
+data_g.add_arg("valid_filelist", str, "", "Path to valid filelist.")
+data_g.add_arg("test_filelist", str, "", "Path to test filelist.")
+data_g.add_arg("vocab_path", str, "./config/vocab.txt", "Vocabulary path.")
+data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
+data_g.add_arg("batch_size", int, 16, "Total examples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("in_tokens", bool, False,
+ "If set, the batch size will be the maximum number of tokens in one batch. "
+ "Otherwise, it will be the maximum number of examples in one batch.")
+
+run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+run_type_g.add_arg("is_distributed", bool, False, "If set, then start distributed training.")
+run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
+run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
+run_type_g.add_arg("do_test", bool, False, "Whether to perform evaluation on test data set.")
+# yapf: enable
diff --git a/ERNIE/reader/__init__.py b/ERNIE/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ERNIE/reader/pretraining.py b/ERNIE/reader/pretraining.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8c3d883ec66571b910e34bc3963db6e6778e2d2
--- /dev/null
+++ b/ERNIE/reader/pretraining.py
@@ -0,0 +1,297 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import os
+import numpy as np
+import types
+import gzip
+import logging
+import re
+import six
+import collections
+import tokenization
+
+import paddle
+import paddle.fluid as fluid
+
+from batching import prepare_batch_data
+
+class ErnieDataReader(object):
+ def __init__(self,
+ filelist,
+ vocab_path,
+ batch_size=4096,
+ max_seq_len=512,
+ shuffle_files=True,
+ epoch=100,
+ voc_size=0,
+ is_test=False,
+ generate_neg_sample=False):
+
+ self.vocab = self.load_vocab(vocab_path)
+ self.filelist = filelist
+ self.batch_size = batch_size
+ self.shuffle_files = shuffle_files
+ self.epoch = epoch
+ self.current_epoch = 0
+ self.current_file_index = 0
+ self.total_file = 0
+ self.current_file = None
+ self.voc_size = voc_size
+ self.max_seq_len = max_seq_len
+ self.pad_id = self.vocab["[PAD]"]
+ self.cls_id = self.vocab["[CLS]"]
+ self.sep_id = self.vocab["[SEP]"]
+ self.mask_id = self.vocab["[MASK]"]
+ self.is_test = is_test
+ self.generate_neg_sample = generate_neg_sample
+        assert self.batch_size > 100, "Current batch size means the total number \
+of tokens, so it should not be set too small."
+
+ if self.is_test:
+ self.epoch = 1
+ self.shuffle_files = False
+
+ def get_progress(self):
+ """return current progress of traning data
+ """
+ return self.current_epoch, self.current_file_index, self.total_file, self.current_file, self.mask_type
+
+ def parse_line(self, line, max_seq_len=512):
+ """ parse one line to token_ids, sentence_ids, pos_ids, label
+ """
+ line = line.strip().split(";")
+ assert len(line) == 5, "One sample must have 5 fields!"
+ (token_ids, sent_ids, pos_ids, seg_labels, label) = line
+ token_ids = [int(token) for token in token_ids.split(" ")]
+ sent_ids = [int(token) for token in sent_ids.split(" ")]
+ pos_ids = [int(token) for token in pos_ids.split(" ")]
+ seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")]
+ assert len(token_ids) == len(sent_ids) == len(
+ pos_ids) == len(seg_labels
+ ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)"
+ label = int(label)
+ if len(token_ids) > max_seq_len:
+ return None
+ return [token_ids, sent_ids, pos_ids, label, seg_labels]
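+
+# Expected input format: one example per line, five ';'-separated fields
+# (hypothetical ids here; seg_labels use -1 for [CLS]/[SEP], 0 for the first
+# sub-token of a word and 1 for continuation sub-tokens):
+#
+#   1 664 4016 2;0 0 0 0;0 1 2 3;-1 0 1 -1;1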
+
+ def read_file(self, file):
+ assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
+ with gzip.open(file, "rb") as f:
+ for line in f:
+ parsed_line = self.parse_line(
+ line, max_seq_len=self.max_seq_len)
+ if parsed_line is None:
+ continue
+ yield parsed_line
+
+ def convert_to_unicode(self, text):
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text.decode("utf-8", "ignore")
+ elif isinstance(text, unicode):
+ return text
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+ def load_vocab(self, vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ fin = open(vocab_file)
+ for num, line in enumerate(fin):
+ items = self.convert_to_unicode(line.strip()).split("\t")
+ if len(items) > 2:
+ break
+ token = items[0]
+ index = items[1] if len(items) == 2 else num
+ token = token.strip()
+ vocab[token] = int(index)
+ return vocab
+
+ def random_pair_neg_samples(self, pos_samples):
+        """ Randomly generate negative samples by re-pairing positive samples.
+
+ Args:
+ pos_samples: list of positive samples
+
+ Returns:
+            neg_samples: list of negative samples
+ """
+ np.random.shuffle(pos_samples)
+ num_sample = len(pos_samples)
+ neg_samples = []
+ miss_num = 0
+
+        def split_sent(sample, max_len, sep_id):
+            token_seq, type_seq, pos_seq, label, seg_labels = sample
+            sep_index = token_seq.index(sep_id)
+            left_len = sep_index - 1
+            # Take the first sentence if it fits in max_len, otherwise the second.
+            if left_len <= max_len:
+                return (token_seq[1:sep_index], seg_labels[1:sep_index])
+            else:
+                return (token_seq[sep_index + 1:-1], seg_labels[sep_index + 1:-1])
+
+ for i in range(num_sample):
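+            # Pair the left sentence of sample i with the right sentence of the
+            # following sample, producing a mismatched sentence pair (label 0).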
+ pair_index = (i + 1) % num_sample
+ left_tokens, left_seg_labels = split_sent(pos_samples[i],
+ (self.max_seq_len - 3) // 2, self.sep_id)
+ right_tokens, right_seg_labels = split_sent(pos_samples[pair_index],
+ self.max_seq_len - 3 - len(left_tokens), self.sep_id)
+
+ token_seq = [self.cls_id] + left_tokens + [self.sep_id] + \
+ right_tokens + [self.sep_id]
+ if len(token_seq) > self.max_seq_len:
+ miss_num += 1
+ continue
+ type_seq = [0] * (len(left_tokens) + 2) + [1] * (len(right_tokens) + 1)
+            pos_seq = list(range(len(token_seq)))
+ seg_label_seq = [-1] + left_seg_labels + [-1] + right_seg_labels + [-1]
+
+            assert len(token_seq) == len(type_seq) == len(pos_seq) == len(seg_label_seq), \
+                "[ERROR] len(token_seq) == len(type_seq) == len(pos_seq) == len(seg_label_seq) must be True"
+ neg_samples.append([token_seq, type_seq, pos_seq, 0, seg_label_seq])
+
+ return neg_samples, miss_num
+
+    def mixin_negative_samples(self, pos_sample_generator, buffer=1000):
+        """ 1. generate negative samples by randomly pairing sentence_1 and sentence_2 of positive samples
+            2. combine negative samples and positive samples
+
+        Args:
+            pos_sample_generator: a generator producing parsed positive samples, each a list: [token_ids, sent_ids, pos_ids, label, seg_labels] with label == 1
+
+        Returns:
+            sample: one sample drawn from the shuffled positive and negative samples
+        """
+ pos_samples = []
+ num_total_miss = 0
+ pos_sample_num = 0
+ try:
+ while True:
+ while len(pos_samples) < buffer:
+ pos_sample = next(pos_sample_generator)
+ label = pos_sample[3]
+ assert label == 1, "positive sample's label must be 1"
+ pos_samples.append(pos_sample)
+ pos_sample_num += 1
+
+ neg_samples, miss_num = self.random_pair_neg_samples(
+ pos_samples)
+ num_total_miss += miss_num
+ samples = pos_samples + neg_samples
+ pos_samples = []
+ np.random.shuffle(samples)
+ for sample in samples:
+ yield sample
+ except StopIteration:
+            print("StopIteration: reached end of file")
+ if len(pos_samples) == 1:
+ yield pos_samples[0]
+ elif len(pos_samples) == 0:
+ yield None
+ else:
+ neg_samples, miss_num = self.random_pair_neg_samples(
+ pos_samples)
+ num_total_miss += miss_num
+ samples = pos_samples + neg_samples
+ pos_samples = []
+ np.random.shuffle(samples)
+ for sample in samples:
+ yield sample
+        if pos_sample_num > 0:
+            print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
+                  (num_total_miss, pos_sample_num * 2,
+                   num_total_miss / (pos_sample_num * 2)))
+
+ def data_generator(self):
+ """
+ data_generator
+ """
+ files = open(self.filelist).readlines()
+ self.total_file = len(files)
+ assert self.total_file > 0, "[Error] data_dir is empty"
+
+ def wrapper():
+ def reader():
+ for epoch in range(self.epoch):
+ self.current_epoch = epoch + 1
+ if self.shuffle_files:
+ np.random.shuffle(files)
+ for index, file in enumerate(files):
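+                        # Each filelist line is "<path>\t<mask_word_prob>"; a
+                        # per-file coin flip picks word- vs char-level masking.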
+ file, mask_word_prob = file.strip().split("\t")
+ mask_word = (np.random.random() < float(mask_word_prob))
+ self.current_file_index = index + 1
+ self.current_file = file
+ if mask_word:
+ self.mask_type = "mask_word"
+ else:
+ self.mask_type = "mask_char"
+
+ sample_generator = self.read_file(file)
+ if not self.is_test and self.generate_neg_sample:
+                        sample_generator = self.mixin_negative_samples(
+ sample_generator)
+ for sample in sample_generator:
+ if sample is None:
+ continue
+ sample.append(mask_word)
+ yield sample
+
+ def batch_reader(reader, batch_size):
+ batch, total_token_num, max_len = [], 0, 0
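+                # Greedy token-budget batching: a sample joins the batch only
+                # while the padded size (num_samples * longest_seq_len) stays
+                # within batch_size.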
+ for parsed_line in reader():
+ token_ids, sent_ids, pos_ids, label, seg_labels, mask_word = parsed_line
+ max_len = max(max_len, len(token_ids))
+ if (len(batch) + 1) * max_len <= batch_size:
+ batch.append(parsed_line)
+ total_token_num += len(token_ids)
+ else:
+ yield batch, total_token_num
+ batch, total_token_num, max_len = [parsed_line], len(
+ token_ids), len(token_ids)
+
+ if len(batch) > 0:
+ yield batch, total_token_num
+
+ for batch_data, total_token_num in batch_reader(reader,
+ self.batch_size):
+ yield prepare_batch_data(
+ batch_data,
+ total_token_num,
+ voc_size=self.voc_size,
+ pad_id=self.pad_id,
+ cls_id=self.cls_id,
+ sep_id=self.sep_id,
+ mask_id=self.mask_id,
+ return_attn_bias=True,
+ return_max_len=False,
+ return_num_token=False)
+
+ return wrapper
+
+
+if __name__ == "__main__":
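+    # Minimal usage sketch (paths below are placeholders, not shipped files):
+    #   reader = ErnieDataReader("./data/train_filelist", "./config/vocab.txt",
+    #                            batch_size=4096, voc_size=18000)
+    #   for batch in reader.data_generator()():  # data_generator returns a callable
+    #       pass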
+ pass
diff --git a/ERNIE/reader/task_reader.py b/ERNIE/reader/task_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..28aacbdbfc9c544a170f6a969605eb464112d335
--- /dev/null
+++ b/ERNIE/reader/task_reader.py
@@ -0,0 +1,313 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import csv
+import json
+import numpy as np
+from collections import namedtuple
+
+import tokenization
+from batching import pad_batch_data
+
+
+class BaseReader(object):
+ def __init__(self,
+ vocab_path,
+ label_map_config=None,
+ max_seq_len=512,
+ do_lower_case=True,
+ in_tokens=False,
+ random_seed=None):
+ self.max_seq_len = max_seq_len
+ self.tokenizer = tokenization.FullTokenizer(
+ vocab_file=vocab_path, do_lower_case=do_lower_case)
+ self.vocab = self.tokenizer.vocab
+ self.pad_id = self.vocab["[PAD]"]
+ self.cls_id = self.vocab["[CLS]"]
+ self.sep_id = self.vocab["[SEP]"]
+ self.in_tokens = in_tokens
+
+ np.random.seed(random_seed)
+
+ self.current_example = 0
+ self.current_epoch = 0
+ self.num_examples = 0
+
+ if label_map_config:
+ with open(label_map_config) as f:
+ self.label_map = json.load(f)
+        else:
+            self.label_map = None
+
+ def get_train_progress(self):
+ """Gets progress for training phase."""
+ return self.current_example, self.current_epoch
+
+ def _read_tsv(self, input_file, quotechar=None):
+ """Reads a tab separated value file."""
+ with open(input_file, "r") as f:
+ reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+ headers = next(reader)
+ Example = namedtuple('Example', headers)
+
+ examples = []
+ for line in reader:
+ example = Example(*line)
+ examples.append(example)
+ return examples
+
+ def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+ """Truncates a sequence pair in place to the maximum length."""
+
+ # This is a simple heuristic which will always truncate the longer sequence
+ # one token at a time. This makes more sense than truncating an equal percent
+ # of tokens from each, since if one sequence is very short then each token
+ # that's truncated likely contains more information than a longer sequence.
+ while True:
+ total_length = len(tokens_a) + len(tokens_b)
+ if total_length <= max_length:
+ break
+ if len(tokens_a) > len(tokens_b):
+ tokens_a.pop()
+ else:
+ tokens_b.pop()
+
+ def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+ """Converts a single `Example` into a single `Record`."""
+
+ text_a = tokenization.convert_to_unicode(example.text_a)
+ tokens_a = tokenizer.tokenize(text_a)
+ tokens_b = None
+ if "text_b" in example._fields:
+ text_b = tokenization.convert_to_unicode(example.text_b)
+ tokens_b = tokenizer.tokenize(text_b)
+
+ if tokens_b:
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
+ # length is less than the specified length.
+ # Account for [CLS], [SEP], [SEP] with "- 3"
+ self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+ else:
+ # Account for [CLS] and [SEP] with "- 2"
+ if len(tokens_a) > max_seq_length - 2:
+ tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+ # The convention in BERT/ERNIE is:
+ # (a) For sequence pairs:
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ # (b) For single sequences:
+ # tokens: [CLS] the dog is hairy . [SEP]
+ # type_ids: 0 0 0 0 0 0 0
+ #
+ # Where "type_ids" are used to indicate whether this is the first
+ # sequence or the second sequence. The embedding vectors for `type=0` and
+ # `type=1` were learned during pre-training and are added to the wordpiece
+ # embedding vector (and position vector). This is not *strictly* necessary
+ # since the [SEP] token unambiguously separates the sequences, but it makes
+ # it easier for the model to learn the concept of sequences.
+ #
+ # For classification tasks, the first vector (corresponding to [CLS]) is
+ # used as as the "sentence vector". Note that this only makes sense because
+ # the entire model is fine-tuned.
+ tokens = []
+ text_type_ids = []
+ tokens.append("[CLS]")
+ text_type_ids.append(0)
+ for token in tokens_a:
+ tokens.append(token)
+ text_type_ids.append(0)
+ tokens.append("[SEP]")
+ text_type_ids.append(0)
+
+ if tokens_b:
+ for token in tokens_b:
+ tokens.append(token)
+ text_type_ids.append(1)
+ tokens.append("[SEP]")
+ text_type_ids.append(1)
+
+ token_ids = tokenizer.convert_tokens_to_ids(tokens)
+ position_ids = list(range(len(token_ids)))
+
+ if self.label_map:
+ label_id = self.label_map[example.label]
+ else:
+ label_id = example.label
+
+ Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
+
+ qid = None
+ if "qid" in example._fields:
+ qid = example.qid
+
+ record = Record(
+ token_ids=token_ids,
+ text_type_ids=text_type_ids,
+ position_ids=position_ids,
+ label_id=label_id,
+ qid=qid)
+ return record
+
+ def _prepare_batch_data(self, examples, batch_size, phase=None):
+ """generate batch records"""
+ batch_records, max_len = [], 0
+ for index, example in enumerate(examples):
+ if phase == "train":
+ self.current_example = index
+ record = self._convert_example_to_record(example, self.max_seq_len, self.tokenizer)
+ max_len = max(max_len, len(record.token_ids))
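+            # in_tokens=True: batch_size is a token budget over the padded
+            # batch; otherwise it is simply the number of examples per batch.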
+ if self.in_tokens:
+ to_append = (len(batch_records) + 1) * max_len <= batch_size
+ else:
+ to_append = len(batch_records) < batch_size
+ if to_append:
+ batch_records.append(record)
+ else:
+ yield self._pad_batch_records(batch_records)
+ batch_records, max_len = [record], len(record.token_ids)
+
+ if len(batch_records) > 0:
+ yield self._pad_batch_records(batch_records)
+
+ def get_num_examples(self, input_file):
+ examples = self._read_tsv(input_file)
+ return len(examples)
+
+ def data_generator(self, input_file, batch_size, epoch, shuffle=True, phase=None):
+ examples = self._read_tsv(input_file)
+
+ def wrapper():
+ for epoch_index in range(epoch):
+ if phase == "train":
+ self.current_example = 0
+ self.current_epoch = epoch_index
+ if shuffle:
+ np.random.shuffle(examples)
+
+ for batch_data in self._prepare_batch_data(examples, batch_size, phase=phase):
+ yield batch_data
+ return wrapper
+
+
+class ClassifyReader(BaseReader):
+ def _read_tsv(self, input_file, quotechar=None):
+ """Reads a tab separated value file."""
+ with open(input_file, "r") as f:
+ reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+ headers = next(reader)
+ text_indices = [index for index, h in enumerate(headers) if h != "label"]
+ Example = namedtuple('Example', headers)
+
+ examples = []
+ for line in reader:
+ for index, text in enumerate(line):
+ if index in text_indices:
+ line[index] = text.replace(' ', '')
+ example = Example(*line)
+ examples.append(example)
+ return examples
+
+ def _pad_batch_records(self, batch_records):
+ batch_token_ids = [record.token_ids for record in batch_records]
+ batch_text_type_ids = [record.text_type_ids for record in batch_records]
+ batch_position_ids = [record.position_ids for record in batch_records]
+ batch_labels = [record.label_id for record in batch_records]
+ batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
+
+        if batch_records[0].qid is not None:
+ batch_qids = [record.qid for record in batch_records]
+ batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
+ else:
+ batch_qids = np.array([]).astype("int64").reshape([-1, 1])
+
+ # padding
+ padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
+ batch_token_ids, pad_idx=self.pad_id, return_next_sent_pos=True, return_attn_bias=True)
+ padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id)
+ padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id)
+
+ return_list = [padded_token_ids, padded_text_type_ids, padded_position_ids, self_attn_bias, batch_labels, next_sent_index, batch_qids]
+
+ return return_list
+
+
+class SequenceLabelReader(BaseReader):
+
+ def _pad_batch_records(self, batch_records):
+ batch_token_ids = [record.token_ids for record in batch_records]
+ batch_text_type_ids = [record.text_type_ids for record in batch_records]
+ batch_position_ids = [record.position_ids for record in batch_records]
+ batch_label_ids = [record.label_ids for record in batch_records]
+ batch_seq_lens = [len(record.token_ids) for record in batch_records]
+
+ # padding
+ padded_token_ids, self_attn_bias = pad_batch_data(
+ batch_token_ids, pad_idx=self.pad_id, return_next_sent_pos=False, return_attn_bias=True)
+ padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id)
+ padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id)
+ padded_label_ids = pad_batch_data(batch_label_ids, pad_idx=len(self.label_map)-1)
+ batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape([-1, 1])
+
+ return_list = [padded_token_ids, padded_text_type_ids, padded_position_ids, self_attn_bias, padded_label_ids, batch_seq_lens]
+ return return_list
+
+ def _reseg_token_label(self, tokens, labels, tokenizer):
+ assert len(tokens) == len(labels)
+ ret_tokens = []
+ ret_labels = []
+ for token, label in zip(tokens, labels):
+ sub_token = tokenizer.tokenize(token)
+ if len(sub_token) == 0:
+ continue
+ ret_tokens.extend(sub_token)
+ ret_labels.append(label)
+ if len(sub_token) < 2:
+ continue
+ sub_label = label
+ if label.startswith("B-"):
+ sub_label = "I-" + label[2:]
+ ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+ assert len(ret_tokens) == len(ret_labels)
+ return ret_tokens, ret_labels
+
+ def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        # text_a and label are char tokens joined by the control character
+        # "\2" (U+0002) in the label-annotated input files.
+        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
+        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
+ tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
+
+ if len(tokens) > max_seq_length - 2:
+ tokens = tokens[0:(max_seq_length - 2)]
+ labels = labels[0:(max_seq_length - 2)]
+
+ tokens = ["[CLS]"] + tokens + ["[SEP]"]
+ token_ids = tokenizer.convert_tokens_to_ids(tokens)
+ position_ids = list(range(len(token_ids)))
+ text_type_ids = [0] * len(token_ids)
+ no_entity_id = len(self.label_map) - 1
+ label_ids = [no_entity_id] + [self.label_map[label] for label in labels] + [no_entity_id]
+
+ Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
+ record = Record(
+ token_ids=token_ids,
+ text_type_ids=text_type_ids,
+ position_ids=position_ids,
+ label_ids=label_ids)
+ return record
+
+if __name__ == '__main__':
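+    # Minimal usage sketch (paths below are placeholders, not shipped files):
+    #   reader = ClassifyReader("./config/vocab.txt", max_seq_len=128)
+    #   train_gen = reader.data_generator("./data/train.tsv", batch_size=32,
+    #                                     epoch=3, phase="train")
+    #   for batch in train_gen():
+    #       pass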
+ pass
diff --git a/ERNIE/run_classifier.py b/ERNIE/run_classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..a891a37f9a3dadb089d8d07644c98a90ef154378
--- /dev/null
+++ b/ERNIE/run_classifier.py
@@ -0,0 +1,276 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning on classification tasks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+import numpy as np
+import multiprocessing
+
+import paddle
+import paddle.fluid as fluid
+
+import reader.task_reader as task_reader
+from model.ernie import ErnieConfig
+from finetune.classifier import create_model, evaluate
+from optimization import optimization
+from utils.args import ArgumentGroup, print_arguments
+from utils.init import init_pretraining_params, init_checkpoint
+from finetune_args import parser
+
+
+args = parser.parse_args()
+
+def main(args):
+ ernie_config = ErnieConfig(args.ernie_config_path)
+ ernie_config.print_config()
+
+ if args.use_cuda:
+ place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
+ dev_count = fluid.core.get_cuda_device_count()
+ else:
+ place = fluid.CPUPlace()
+ dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+ exe = fluid.Executor(place)
+
+ reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
+ label_map_config=args.label_map_config,
+ max_seq_len=args.max_seq_len,
+ do_lower_case=args.do_lower_case,
+ in_tokens=args.in_tokens,
+ random_seed=args.random_seed)
+
+ if not (args.do_train or args.do_val or args.do_test):
+ raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
+ "least one of them must be True.")
+
+ startup_prog = fluid.Program()
+ if args.random_seed is not None:
+ startup_prog.random_seed = args.random_seed
+
+ if args.do_train:
+ train_data_generator = reader.data_generator(
+ input_file=args.train_set,
+ batch_size=args.batch_size,
+ epoch=args.epoch,
+ shuffle=True,
+ phase="train")
+
+ num_train_examples = reader.get_num_examples(args.train_set)
+
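+        # With in_tokens, batch_size counts tokens, so batch_size // max_seq_len
+        # approximates the number of examples processed per device per step.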
+ if args.in_tokens:
+ max_train_steps = args.epoch * num_train_examples // (
+ args.batch_size // args.max_seq_len) // dev_count
+ else:
+ max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
+
+ warmup_steps = int(max_train_steps * args.warmup_proportion)
+ print("Device count: %d" % dev_count)
+ print("Num train examples: %d" % num_train_examples)
+ print("Max train steps: %d" % max_train_steps)
+ print("Num warmup steps: %d" % warmup_steps)
+
+ train_program = fluid.Program()
+
+ with fluid.program_guard(train_program, startup_prog):
+ with fluid.unique_name.guard():
+ train_pyreader, graph_vars = create_model(
+ args,
+ pyreader_name='train_reader',
+ ernie_config=ernie_config)
+ scheduled_lr = optimization(
+ loss=graph_vars["loss"],
+ warmup_steps=warmup_steps,
+ num_train_steps=max_train_steps,
+ learning_rate=args.learning_rate,
+ train_program=train_program,
+ startup_prog=startup_prog,
+ weight_decay=args.weight_decay,
+ scheduler=args.lr_scheduler,
+ use_fp16=args.use_fp16,
+ loss_scaling=args.loss_scaling)
+
+ fluid.memory_optimize(
+ input_program=train_program,
+ skip_opt_set=[graph_vars["loss"].name,
+ graph_vars["probs"].name,
+ graph_vars["accuracy"].name,
+ graph_vars["num_seqs"].name,
+ ])
+
+ if args.verbose:
+ if args.in_tokens:
+ lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
+ program=train_program,
+ batch_size=args.batch_size // args.max_seq_len)
+ else:
+ lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
+ program=train_program, batch_size=args.batch_size)
+ print("Theoretical memory usage in training: %.3f - %.3f %s" %
+ (lower_mem, upper_mem, unit))
+
+ if args.do_val or args.do_test:
+ test_prog = fluid.Program()
+ with fluid.program_guard(test_prog, startup_prog):
+ with fluid.unique_name.guard():
+ test_pyreader, graph_vars = create_model(
+ args,
+ pyreader_name='test_reader',
+ ernie_config=ernie_config)
+
+ test_prog = test_prog.clone(for_test=True)
+
+ exe.run(startup_prog)
+
+ if args.do_train:
+ if args.init_checkpoint and args.init_pretraining_params:
+ print(
+ "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
+ "both are set! Only arg 'init_checkpoint' is made valid.")
+ if args.init_checkpoint:
+ init_checkpoint(
+ exe,
+ args.init_checkpoint,
+ main_program=startup_prog,
+ use_fp16=args.use_fp16)
+ elif args.init_pretraining_params:
+ init_pretraining_params(
+ exe,
+ args.init_pretraining_params,
+ main_program=startup_prog,
+ use_fp16=args.use_fp16)
+ elif args.do_val or args.do_test:
+ if not args.init_checkpoint:
+            raise ValueError("args 'init_checkpoint' should be set if "
+                             "only doing validation or testing!")
+ init_checkpoint(
+ exe,
+ args.init_checkpoint,
+ main_program=startup_prog,
+ use_fp16=args.use_fp16)
+
+ if args.do_train:
+ exec_strategy = fluid.ExecutionStrategy()
+ if args.use_fast_executor:
+ exec_strategy.use_experimental_executor = True
+ exec_strategy.num_threads = dev_count
+ exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
+
+ train_exe = fluid.ParallelExecutor(
+ use_cuda=args.use_cuda,
+ loss_name=graph_vars["loss"].name,
+ exec_strategy=exec_strategy,
+ main_program=train_program)
+
+ train_pyreader.decorate_tensor_provider(train_data_generator)
+ else:
+ train_exe = None
+
+ if args.do_val or args.do_test:
+ test_exe = fluid.ParallelExecutor(
+ use_cuda=args.use_cuda,
+ main_program=test_prog,
+ share_vars_from=train_exe)
+
+ if args.do_train:
+ train_pyreader.start()
+ steps = 0
+ if warmup_steps > 0:
+ graph_vars["learning_rate"] = scheduled_lr
+
+ time_begin = time.time()
+ while True:
+ try:
+ steps += 1
+ if steps % args.skip_steps != 0:
+ train_exe.run(fetch_list=[])
+ else:
+ outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, "train")
+
+ if args.verbose:
+                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
+ verbose += "learning rate: %f" % (
+ outputs["learning_rate"]
+ if warmup_steps > 0 else args.learning_rate)
+ print(verbose)
+
+ current_example, current_epoch = reader.get_train_progress()
+ time_end = time.time()
+ used_time = time_end - time_begin
+ print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
+ "ave acc: %f, speed: %f steps/s" %
+ (current_epoch, current_example, num_train_examples,
+ steps, outputs["loss"], outputs["accuracy"], args.skip_steps / used_time))
+ time_begin = time.time()
+
+ if steps % args.save_steps == 0:
+ save_path = os.path.join(args.checkpoints,
+ "step_" + str(steps))
+ fluid.io.save_persistables(exe, save_path, train_program)
+
+ if steps % args.validation_steps == 0:
+ # evaluate dev set
+ if args.do_val:
+ test_pyreader.decorate_tensor_provider(
+ reader.data_generator(
+ args.dev_set,
+ batch_size=args.batch_size,
+ epoch=1,
+ shuffle=False))
+ evaluate(exe, test_prog, test_pyreader, graph_vars, "dev")
+ # evaluate test set
+ if args.do_test:
+ test_pyreader.decorate_tensor_provider(
+ reader.data_generator(
+ args.test_set,
+ batch_size=args.batch_size,
+ epoch=1,
+ shuffle=False))
+ evaluate(exe, test_prog, test_pyreader, graph_vars, "test")
+ except fluid.core.EOFException:
+ save_path = os.path.join(args.checkpoints, "step_" + str(steps))
+ fluid.io.save_persistables(exe, save_path, train_program)
+ train_pyreader.reset()
+ break
+
+ # final eval on dev set
+ if args.do_val:
+ test_pyreader.decorate_tensor_provider(
+ reader.data_generator(
+ args.dev_set, batch_size=args.batch_size, epoch=1,
+ shuffle=False))
+ print("Final validation result:")
+ evaluate(exe, test_prog, test_pyreader, graph_vars, "dev")
+
+ # final eval on test set
+ if args.do_test:
+ test_pyreader.decorate_tensor_provider(
+ reader.data_generator(
+ args.test_set,
+ batch_size=args.batch_size,
+ epoch=1,
+ shuffle=False))
+ print("Final test result:")
+ evaluate(exe, test_prog, test_pyreader, graph_vars, "test")
+
+
+if __name__ == '__main__':
+ print_arguments(args)
+ main(args)
diff --git a/ERNIE/run_sequence_labeling.py b/ERNIE/run_sequence_labeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..71158cf219e1583ea6427448a947740cb86ecc06
--- /dev/null
+++ b/ERNIE/run_sequence_labeling.py
@@ -0,0 +1,277 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning on classification tasks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import numpy as np
+import multiprocessing
+
+import paddle
+import paddle.fluid as fluid
+
+import reader.task_reader as task_reader
+from model.ernie import ErnieConfig
+from optimization import optimization
+from utils.init import init_pretraining_params, init_checkpoint
+from utils.args import print_arguments
+from finetune.sequence_label import create_model, evaluate
+from finetune_args import parser
+
+
+args = parser.parse_args()
+
+def main(args):
+ ernie_config = ErnieConfig(args.ernie_config_path)
+ ernie_config.print_config()
+
+ if args.use_cuda:
+ place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
+ dev_count = fluid.core.get_cuda_device_count()
+ else:
+ place = fluid.CPUPlace()
+ dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+ exe = fluid.Executor(place)
+
+ reader = task_reader.SequenceLabelReader(vocab_path=args.vocab_path,
+ label_map_config=args.label_map_config,
+ max_seq_len=args.max_seq_len,
+ do_lower_case=args.do_lower_case,
+ in_tokens=args.in_tokens,
+ random_seed=args.random_seed)
+
+ if not (args.do_train or args.do_val or args.do_test):
+ raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
+ "least one of them must be True.")
+
+ startup_prog = fluid.Program()
+ if args.random_seed is not None:
+ startup_prog.random_seed = args.random_seed
+
+ if args.do_train:
+ train_data_generator = reader.data_generator(
+ input_file=args.train_set,
+ batch_size=args.batch_size,
+ epoch=args.epoch,
+ shuffle=True,
+ phase="train")
+
+ num_train_examples = reader.get_num_examples(args.train_set)
+
+ if args.in_tokens:
+ max_train_steps = args.epoch * num_train_examples // (
+ args.batch_size // args.max_seq_len) // dev_count
+ else:
+ max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
+
+ warmup_steps = int(max_train_steps * args.warmup_proportion)
+ print("Device count: %d" % dev_count)
+ print("Num train examples: %d" % num_train_examples)
+ print("Max train steps: %d" % max_train_steps)
+ print("Num warmup steps: %d" % warmup_steps)
+
+ train_program = fluid.Program()
+
+ with fluid.program_guard(train_program, startup_prog):
+ with fluid.unique_name.guard():
+ train_pyreader, graph_vars = create_model(
+ args,
+ pyreader_name='train_reader',
+ ernie_config=ernie_config)
+ scheduled_lr = optimization(
+ loss=graph_vars["loss"],
+ warmup_steps=warmup_steps,
+ num_train_steps=max_train_steps,
+ learning_rate=args.learning_rate,
+ train_program=train_program,
+ startup_prog=startup_prog,
+ weight_decay=args.weight_decay,
+ scheduler=args.lr_scheduler,
+ use_fp16=args.use_fp16,
+ loss_scaling=args.loss_scaling)
+
+ fluid.memory_optimize(
+ input_program=train_program,
+ skip_opt_set=[graph_vars["loss"].name,
+ graph_vars["labels"].name,
+ graph_vars["infers"].name,
+ graph_vars["seq_lens"].name
+ ])
+
+ if args.verbose:
+ if args.in_tokens:
+ lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
+ program=train_program,
+ batch_size=args.batch_size // args.max_seq_len)
+ else:
+ lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
+ program=train_program, batch_size=args.batch_size)
+ print("Theoretical memory usage in training: %.3f - %.3f %s" %
+ (lower_mem, upper_mem, unit))
+
+ if args.do_val or args.do_test:
+ test_prog = fluid.Program()
+ with fluid.program_guard(test_prog, startup_prog):
+ with fluid.unique_name.guard():
+ test_pyreader, graph_vars = create_model(
+ args,
+ pyreader_name='test_reader',
+ ernie_config=ernie_config)
+
+ test_prog = test_prog.clone(for_test=True)
+
+ exe.run(startup_prog)
+
+ if args.do_train:
+ if args.init_checkpoint and args.init_pretraining_params:
+ print(
+ "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
+ "both are set! Only arg 'init_checkpoint' is made valid.")
+ if args.init_checkpoint:
+ init_checkpoint(
+ exe,
+ args.init_checkpoint,
+ main_program=startup_prog,
+ use_fp16=args.use_fp16)
+ elif args.init_pretraining_params:
+ init_pretraining_params(
+ exe,
+ args.init_pretraining_params,
+ main_program=startup_prog,
+ use_fp16=args.use_fp16)
+ elif args.do_val or args.do_test:
+ if not args.init_checkpoint:
+            raise ValueError("args 'init_checkpoint' should be set if "
+                             "only doing validation or testing!")
+ init_checkpoint(
+ exe,
+ args.init_checkpoint,
+ main_program=startup_prog,
+ use_fp16=args.use_fp16)
+
+ if args.do_train:
+ exec_strategy = fluid.ExecutionStrategy()
+ if args.use_fast_executor:
+ exec_strategy.use_experimental_executor = True
+ exec_strategy.num_threads = dev_count
+ exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
+
+ train_exe = fluid.ParallelExecutor(
+ use_cuda=args.use_cuda,
+ loss_name=graph_vars["loss"].name,
+ exec_strategy=exec_strategy,
+ main_program=train_program)
+
+ train_pyreader.decorate_tensor_provider(train_data_generator)
+ else:
+ train_exe = None
+
+ if args.do_val or args.do_test:
+ test_exe = fluid.ParallelExecutor(
+ use_cuda=args.use_cuda,
+ main_program=test_prog,
+ share_vars_from=train_exe)
+
+ if args.do_train:
+ train_pyreader.start()
+ steps = 0
+ if warmup_steps > 0:
+ graph_vars["learning_rate"] = scheduled_lr
+
+ time_begin = time.time()
+ while True:
+ try:
+ steps += 1
+ if steps % args.skip_steps != 0:
+ train_exe.run(fetch_list=[])
+ else:
+ outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, args.num_labels, "train", dev_count)
+ if args.verbose:
+                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
+ verbose += "learning rate: %f" % (
+ outputs["lr"] if warmup_steps > 0 else args.learning_rate)
+ print(verbose)
+
+                    current_example, current_epoch = reader.get_train_progress()
+ time_end = time.time()
+ used_time = time_end - time_begin
+ print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
+ "f1: %f, precision: %f, recall: %f, speed: %f steps/s" %
+ (current_epoch, current_example, num_train_examples,
+ steps, outputs["loss"], outputs["f1"],
+ outputs["precision"], outputs["recall"],
+ args.skip_steps / used_time))
+ time_begin = time.time()
+
+ if steps % args.save_steps == 0:
+ save_path = os.path.join(args.checkpoints,
+ "step_" + str(steps))
+ fluid.io.save_persistables(exe, save_path, train_program)
+
+ if steps % args.validation_steps == 0:
+ # evaluate dev set
+ if args.do_val:
+ test_pyreader.decorate_tensor_provider(
+ reader.data_generator(
+ args.dev_set,
+ batch_size=args.batch_size,
+ epoch=1,
+ shuffle=False))
+ evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev")
+ # evaluate test set
+ if args.do_test:
+ test_pyreader.decorate_tensor_provider(
+ reader.data_generator(
+ args.test_set,
+ batch_size=args.batch_size,
+ epoch=1,
+ shuffle=False))
+ evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "test")
+
+ except fluid.core.EOFException:
+ save_path = os.path.join(args.checkpoints, "step_" + str(steps))
+ fluid.io.save_persistables(exe, save_path, train_program)
+ train_pyreader.reset()
+ break
+
+ # final eval on dev set
+ if args.do_val:
+ test_pyreader.decorate_tensor_provider(
+ reader.data_generator(
+ args.dev_set, batch_size=args.batch_size, epoch=1,
+ shuffle=False))
+ print("Final validation result:")
+ evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev")
+
+ # final eval on test set
+ if args.do_test:
+ test_pyreader.decorate_tensor_provider(
+ reader.data_generator(
+ args.test_set,
+ batch_size=args.batch_size,
+ epoch=1,
+ shuffle=False))
+ print("Final test result:")
+ evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "test")
+
+
+if __name__ == '__main__':
+ print_arguments(args)
+ main(args)
diff --git a/ERNIE/tokenization.py b/ERNIE/tokenization.py
new file mode 100644
index 0000000000000000000000000000000000000000..f906b537177dce430888fbc1738cd4b33906d705
--- /dev/null
+++ b/ERNIE/tokenization.py
@@ -0,0 +1,370 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import unicodedata
+import six
+
+
+def convert_to_unicode(text):
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text.decode("utf-8", "ignore")
+ elif isinstance(text, unicode):
+ return text
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+
+def printable_text(text):
+ """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+ # These functions want `str` for both Python2 and Python3, but in one case
+ # it's a Unicode string and in the other it's a byte string.
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, unicode):
+ return text.encode("utf-8")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ fin = open(vocab_file)
+ for num, line in enumerate(fin):
+ items = convert_to_unicode(line.strip()).split("\t")
+ if len(items) > 2:
+ break
+ token = items[0]
+ index = items[1] if len(items) == 2 else num
+ token = token.strip()
+ vocab[token] = int(index)
+ return vocab
+
+
+def convert_by_vocab(vocab, items):
+ """Converts a sequence of [tokens|ids] using the vocab."""
+ output = []
+ for item in items:
+ output.append(vocab[item])
+ return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+ return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+ return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+ text = text.strip()
+ if not text:
+ return []
+ tokens = text.split()
+ return tokens
+
+
+class FullTokenizer(object):
+    """Runs end-to-end tokenization."""
+
+ def __init__(self, vocab_file, do_lower_case=True):
+ self.vocab = load_vocab(vocab_file)
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
+ self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in self.basic_tokenizer.tokenize(text):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ return convert_by_vocab(self.vocab, tokens)
+
+ def convert_ids_to_tokens(self, ids):
+ return convert_by_vocab(self.inv_vocab, ids)
+
+
+class CharTokenizer(object):
+    """Runs end-to-end tokenization on whitespace-split, lower-cased input
+    (no BasicTokenizer pass)."""
+
+ def __init__(self, vocab_file, do_lower_case=True):
+ self.vocab = load_vocab(vocab_file)
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in text.lower().split(" "):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ return convert_by_vocab(self.vocab, tokens)
+
+ def convert_ids_to_tokens(self, ids):
+ return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+ """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+ def __init__(self, do_lower_case=True):
+ """Constructs a BasicTokenizer.
+
+ Args:
+ do_lower_case: Whether to lower case the input.
+ """
+ self.do_lower_case = do_lower_case
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text."""
+ text = convert_to_unicode(text)
+ text = self._clean_text(text)
+
+ # This was added on November 1st, 2018 for the multilingual and Chinese
+ # models. This is also applied to the English models now, but it doesn't
+ # matter since the English models were not trained on any Chinese data
+ # and generally don't have any Chinese data in them (there are Chinese
+ # characters in the vocabulary because Wikipedia does have some Chinese
+ # words in the English Wikipedia.).
+ text = self._tokenize_chinese_chars(text)
+
+ orig_tokens = whitespace_tokenize(text)
+ split_tokens = []
+ for token in orig_tokens:
+ if self.do_lower_case:
+ token = token.lower()
+ token = self._run_strip_accents(token)
+ split_tokens.extend(self._run_split_on_punc(token))
+
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
+ return output_tokens
+
+ def _run_strip_accents(self, text):
+ """Strips accents from a piece of text."""
+ text = unicodedata.normalize("NFD", text)
+ output = []
+ for char in text:
+ cat = unicodedata.category(char)
+ if cat == "Mn":
+ continue
+ output.append(char)
+ return "".join(output)
+
+ def _run_split_on_punc(self, text):
+ """Splits punctuation on a piece of text."""
+ chars = list(text)
+ i = 0
+ start_new_word = True
+ output = []
+ while i < len(chars):
+ char = chars[i]
+ if _is_punctuation(char):
+ output.append([char])
+ start_new_word = True
+ else:
+ if start_new_word:
+ output.append([])
+ start_new_word = False
+ output[-1].append(char)
+ i += 1
+
+ return ["".join(x) for x in output]
+
+ def _tokenize_chinese_chars(self, text):
+ """Adds whitespace around any CJK character."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if self._is_chinese_char(cp):
+ output.append(" ")
+ output.append(char)
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+ def _is_chinese_char(self, cp):
+ """Checks whether CP is the codepoint of a CJK character."""
+ # This defines a "chinese character" as anything in the CJK Unicode block:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ #
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+ # despite its name. The modern Korean Hangul alphabet is a different block,
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+ # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
+ (cp >= 0x3400 and cp <= 0x4DBF) or #
+ (cp >= 0x20000 and cp <= 0x2A6DF) or #
+ (cp >= 0x2A700 and cp <= 0x2B73F) or #
+ (cp >= 0x2B740 and cp <= 0x2B81F) or #
+ (cp >= 0x2B820 and cp <= 0x2CEAF) or
+ (cp >= 0xF900 and cp <= 0xFAFF) or #
+ (cp >= 0x2F800 and cp <= 0x2FA1F)): #
+ return True
+
+ return False
+
+ def _clean_text(self, text):
+ """Performs invalid character removal and whitespace cleanup on text."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if cp == 0 or cp == 0xfffd or _is_control(char):
+ continue
+ if _is_whitespace(char):
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+ def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+ self.vocab = vocab
+ self.unk_token = unk_token
+ self.max_input_chars_per_word = max_input_chars_per_word
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text into its word pieces.
+
+ This uses a greedy longest-match-first algorithm to perform tokenization
+ using the given vocabulary.
+
+ For example:
+ input = "unaffable"
+ output = ["un", "##aff", "##able"]
+
+ Args:
+          text: A single token or whitespace separated tokens. This should have
+            already been passed through `BasicTokenizer`.
+
+ Returns:
+ A list of wordpiece tokens.
+ """
+
+ text = convert_to_unicode(text)
+
+ output_tokens = []
+ for token in whitespace_tokenize(text):
+ chars = list(token)
+ if len(chars) > self.max_input_chars_per_word:
+ output_tokens.append(self.unk_token)
+ continue
+
+ is_bad = False
+ start = 0
+ sub_tokens = []
+ while start < len(chars):
+ end = len(chars)
+ cur_substr = None
+ while start < end:
+ substr = "".join(chars[start:end])
+ if start > 0:
+ substr = "##" + substr
+ if substr in self.vocab:
+ cur_substr = substr
+ break
+ end -= 1
+ if cur_substr is None:
+ is_bad = True
+ break
+ sub_tokens.append(cur_substr)
+ start = end
+
+ if is_bad:
+ output_tokens.append(self.unk_token)
+ else:
+ output_tokens.extend(sub_tokens)
+ return output_tokens
+
+
+def _is_whitespace(char):
+ """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+ # as whitespace since they are generally considered as such.
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
+ return True
+ cat = unicodedata.category(char)
+ if cat == "Zs":
+ return True
+ return False
+
+
+def _is_control(char):
+ """Checks whether `chars` is a control character."""
+ # These are technically control characters but we count them as whitespace
+ # characters.
+ if char == "\t" or char == "\n" or char == "\r":
+ return False
+ cat = unicodedata.category(char)
+ if cat.startswith("C"):
+ return True
+ return False
+
+
+def _is_punctuation(char):
+ """Checks whether `chars` is a punctuation character."""
+ cp = ord(char)
+ # We treat all non-letter/number ASCII as punctuation.
+ # Characters such as "^", "$", and "`" are not in the Unicode
+ # Punctuation class but we treat them as punctuation anyways, for
+ # consistency.
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+ (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+ return True
+ cat = unicodedata.category(char)
+ if cat.startswith("P"):
+ return True
+ return False
diff --git a/ERNIE/train.py b/ERNIE/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..470d34bc94a01d16629784019ed2562bc86b6e82
--- /dev/null
+++ b/ERNIE/train.py
@@ -0,0 +1,360 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ERNIE pretraining."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+import numpy as np
+import multiprocessing
+
+import paddle
+import paddle.fluid as fluid
+
+from reader.pretraining import ErnieDataReader
+from model.ernie import ErnieModel, ErnieConfig
+from optimization import optimization
+from utils.args import ArgumentGroup, print_arguments
+from utils.init import init_checkpoint, init_pretraining_params
+
+from pretrain_args import parser
+
+args = parser.parse_args()
+# yapf: enable
+
+def create_model(pyreader_name, ernie_config):
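+    # One pretraining sample feeds eight slots: src_ids, pos_ids, sent_ids,
+    # self_attn_mask, mask_label, mask_pos, labels (next-sentence label), and
+    # next_sent_index, matching the shapes/dtypes declared below.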
+ pyreader = fluid.layers.py_reader(
+ capacity=70,
+ shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
+ [-1, args.max_seq_len, 1],
+ [-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1],
+ [-1, 1], [-1, 1]],
+ dtypes=[
+ 'int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64',
+ 'int64'
+ ],
+ lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
+ name=pyreader_name,
+ use_double_buffer=True)
+
+ (src_ids, pos_ids, sent_ids, self_attn_mask, mask_label, mask_pos, labels,
+ next_sent_index) = fluid.layers.read_file(pyreader)
+
+ ernie = ErnieModel(
+ src_ids=src_ids,
+ position_ids=pos_ids,
+ sentence_ids=sent_ids,
+ self_attn_mask=self_attn_mask,
+ config=ernie_config,
+ weight_sharing=args.weight_sharing,
+ use_fp16=args.use_fp16)
+
+ next_sent_acc, mask_lm_loss, total_loss = ernie.get_pretraining_output(
+ mask_label, mask_pos, labels, next_sent_index)
+
+ if args.use_fp16 and args.loss_scaling > 1.0:
+ total_loss *= args.loss_scaling
+
+ return pyreader, next_sent_acc, mask_lm_loss, total_loss
+
+
+def predict_wrapper(args,
+ exe,
+ ernie_config,
+ test_prog=None,
+ pyreader=None,
+ fetch_list=None):
+ # Context to do validation.
+ filelist = args.test_filelist if args.do_test else args.valid_filelist
+ data_reader = ErnieDataReader(
+ filelist,
+ vocab_path=args.vocab_path,
+ batch_size=args.batch_size,
+ voc_size=ernie_config['vocab_size'],
+ shuffle_files=False,
+ epoch=1,
+ max_seq_len=args.max_seq_len,
+ is_test=True)
+
+ if args.do_test:
+        assert args.init_checkpoint is not None, "[FATAL] Please use --init_checkpoint '/path/to/checkpoints' \
+            to specify your pretrained model checkpoints"
+
+ init_pretraining_params(exe, args.init_checkpoint, test_prog)
+
+ def predict(exe=exe, pyreader=pyreader):
+
+ pyreader.decorate_tensor_provider(data_reader.data_generator())
+ pyreader.start()
+
+ cost = 0
+ lm_cost = 0
+ acc = 0
+ steps = 0
+ time_begin = time.time()
+ while True:
+ try:
+ each_next_acc, each_mask_lm_cost, each_total_cost = exe.run(
+ fetch_list=fetch_list, program=test_prog)
+ acc += each_next_acc
+ lm_cost += each_mask_lm_cost
+ cost += each_total_cost
+ steps += 1
+ if args.do_test and steps % args.skip_steps == 0:
+ print("[test_set] steps: %d" % steps)
+
+ except fluid.core.EOFException:
+ pyreader.reset()
+ break
+
+ used_time = time.time() - time_begin
+ return cost, lm_cost, acc, steps, (args.skip_steps / used_time)
+
+ return predict
+
+
+def test(args):
+ ernie_config = ErnieConfig(args.ernie_config_path)
+ ernie_config.print_config()
+
+ test_prog = fluid.Program()
+ test_startup = fluid.Program()
+ with fluid.program_guard(test_prog, test_startup):
+ with fluid.unique_name.guard():
+ test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+ pyreader_name='test_reader', ernie_config=ernie_config)
+
+ test_prog = test_prog.clone(for_test=True)
+
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(test_startup)
+
+ predict = predict_wrapper(
+ args,
+ exe,
+ ernie_config,
+ test_prog=test_prog,
+ pyreader=test_pyreader,
+ fetch_list=[next_sent_acc.name, mask_lm_loss.name, total_loss.name])
+
+ print("test begin")
+ loss, lm_loss, acc, steps, speed = predict()
+ print(
+ "[test_set] loss: %f, global ppl: %f, next_sent_acc: %f, speed: %f steps/s"
+ % (np.mean(np.array(loss) / steps),
+ np.exp(np.mean(np.array(lm_loss) / steps)),
+ np.mean(np.array(acc) / steps), speed))
+
+
+def train(args):
+ print("pretraining start")
+ ernie_config = ErnieConfig(args.ernie_config_path)
+ ernie_config.print_config()
+
+ train_program = fluid.Program()
+ startup_prog = fluid.Program()
+ with fluid.program_guard(train_program, startup_prog):
+ with fluid.unique_name.guard():
+ train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+ pyreader_name='train_reader', ernie_config=ernie_config)
+ scheduled_lr = optimization(
+ loss=total_loss,
+ warmup_steps=args.warmup_steps,
+ num_train_steps=args.num_train_steps,
+ learning_rate=args.learning_rate,
+ train_program=train_program,
+ startup_prog=startup_prog,
+ weight_decay=args.weight_decay,
+ scheduler=args.lr_scheduler,
+ use_fp16=args.use_fp16,
+ loss_scaling=args.loss_scaling)
+
+ fluid.memory_optimize(
+ input_program=train_program,
+ skip_opt_set=[
+ next_sent_acc.name, mask_lm_loss.name, total_loss.name
+ ])
+
+ test_prog = fluid.Program()
+ with fluid.program_guard(test_prog, startup_prog):
+ with fluid.unique_name.guard():
+ test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+ pyreader_name='test_reader', ernie_config=ernie_config)
+
+ test_prog = test_prog.clone(for_test=True)
+
+ if args.use_cuda:
+ place = fluid.CUDAPlace(0)
+ dev_count = fluid.core.get_cuda_device_count()
+ else:
+ place = fluid.CPUPlace()
+ dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+
+ print("Device count %d" % dev_count)
+ print("theoretical memory usage: ")
+ print(fluid.contrib.memory_usage(
+ program=train_program, batch_size=args.batch_size // args.max_seq_len))
+
+ nccl2_num_trainers = 1
+ nccl2_trainer_id = 0
+ print("args.is_distributed:", args.is_distributed)
+ if args.is_distributed:
+ worker_endpoints_env = os.getenv("worker_endpoints")
+ worker_endpoints = worker_endpoints_env.split(",")
+ trainers_num = len(worker_endpoints)
+ current_endpoint = os.getenv("current_endpoint")
+ trainer_id = worker_endpoints.index(current_endpoint)
+ if trainer_id == 0:
+ print("train_id == 0, sleep 60s")
+ time.sleep(60)
+ print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
+ trainer_id:{}"
+ .format(worker_endpoints, trainers_num,
+ current_endpoint, trainer_id))
+
+ # prepare nccl2 env.
+ config = fluid.DistributeTranspilerConfig()
+ config.mode = "nccl2"
+ t = fluid.DistributeTranspiler(config=config)
+ t.transpile(
+ trainer_id,
+ trainers=worker_endpoints_env,
+ current_endpoint=current_endpoint,
+ program=train_program,
+ startup_program=startup_prog)
+ nccl2_num_trainers = trainers_num
+ nccl2_trainer_id = trainer_id
+
+ exe = fluid.Executor(place)
+ exe.run(startup_prog)
+
+ if args.init_checkpoint and args.init_checkpoint != "":
+ init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16)
+
+ data_reader = ErnieDataReader(
+ filelist=args.train_filelist,
+ batch_size=args.batch_size,
+ vocab_path=args.vocab_path,
+ voc_size=ernie_config['vocab_size'],
+ epoch=args.epoch,
+ max_seq_len=args.max_seq_len,
+ generate_neg_sample=args.generate_neg_sample)
+
+ exec_strategy = fluid.ExecutionStrategy()
+ if args.use_fast_executor:
+ exec_strategy.use_experimental_executor = True
+ exec_strategy.num_threads = dev_count
+ exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps)
+
+ build_strategy = fluid.BuildStrategy()
+ build_strategy.remove_unnecessary_lock = False
+
+ train_exe = fluid.ParallelExecutor(
+ use_cuda=args.use_cuda,
+ loss_name=total_loss.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy,
+ main_program=train_program,
+ num_trainers=nccl2_num_trainers,
+ trainer_id=nccl2_trainer_id)
+
+ if args.valid_filelist and args.valid_filelist != "":
+ predict = predict_wrapper(
+ args,
+ exe,
+ ernie_config,
+ test_prog=test_prog,
+ pyreader=test_pyreader,
+ fetch_list=[
+ next_sent_acc.name, mask_lm_loss.name, total_loss.name
+ ])
+
+ train_pyreader.decorate_tensor_provider(data_reader.data_generator())
+ train_pyreader.start()
+ steps = 0
+ cost = []
+ lm_cost = []
+ acc = []
+ time_begin = time.time()
+ while steps < args.num_train_steps:
+ try:
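+            # The global step counter advances by the number of trainers per
+            # iteration, keeping step-based logging/saving aligned across
+            # workers; only trainer 0 fetches metrics and prints.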
+ steps += nccl2_num_trainers
+ skip_steps = args.skip_steps * nccl2_num_trainers
+
+ if nccl2_trainer_id != 0:
+ train_exe.run(fetch_list=[])
+ continue
+
+ if steps % skip_steps != 0:
+ train_exe.run(fetch_list=[])
+ else:
+ each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
+ fetch_list=[
+ next_sent_acc.name, mask_lm_loss.name, total_loss.name,
+ scheduled_lr.name
+ ])
+ acc.extend(each_next_acc)
+ lm_cost.extend(each_mask_lm_cost)
+ cost.extend(each_total_cost)
+
+ print("feed_queue size", train_pyreader.queue.size())
+ time_end = time.time()
+ used_time = time_end - time_begin
+                epoch, current_file_index, total_file, current_file, \
+                    mask_type = data_reader.get_progress()
+ print("current learning_rate:%f" % np_lr[0])
+ print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
+ "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
+ % (epoch, current_file_index, total_file, steps,
+ np.mean(np.array(cost)),
+ np.mean(np.exp(np.array(lm_cost))),
+ np.mean(np.array(acc)), skip_steps / used_time,
+ current_file, mask_type))
+ cost = []
+ lm_cost = []
+ acc = []
+ time_begin = time.time()
+
+ if steps % args.save_steps == 0:
+ save_path = os.path.join(args.checkpoints, "step_" + str(steps))
+ fluid.io.save_persistables(exe, save_path, train_program)
+
+ if args.valid_filelist and steps % args.validation_steps == 0:
+                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict()
+ print("[validation_set] epoch: %d, step: %d, "
+                      "loss: %f, global ppl: %f, batch-averaged ppl: %f, "
+ "next_sent_acc: %f, speed: %f steps/s" %
+ (epoch, steps,
+ np.mean(np.array(vali_cost) / vali_steps),
+ np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
+ np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
+ np.mean(np.array(vali_acc) / vali_steps), vali_speed))
+
+ except fluid.core.EOFException:
+ train_pyreader.reset()
+ break
+
+
+if __name__ == '__main__':
+ print_arguments(args)
+ if args.do_test:
+ test(args)
+ else:
+ train(args)
diff --git a/ERNIE/utils/__init__.py b/ERNIE/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ERNIE/utils/args.py b/ERNIE/utils/args.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9be634f0f383db61eb667df2345a89262179fd8
--- /dev/null
+++ b/ERNIE/utils/args.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Arguments for configuration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import argparse
+
+
+def str2bool(v):
+    # argparse cannot parse strings like "True"/"False" into Python booleans
+    # directly, so interpret common truthy spellings here
+ return v.lower() in ("true", "t", "1")
+
+
+class ArgumentGroup(object):
+ def __init__(self, parser, title, des):
+ self._group = parser.add_argument_group(title=title, description=des)
+
+ def add_arg(self, name, type, default, help, **kwargs):
+ type = str2bool if type == bool else type
+ self._group.add_argument(
+ "--" + name,
+ default=default,
+ type=type,
+ help=help + ' Default: %(default)s.',
+ **kwargs)
+
+
+def print_arguments(args):
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(six.iteritems(vars(args))):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
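+
+
+# Example usage (illustrative; the group and argument names below are
+# hypothetical, not part of this module):
+#
+#     parser = argparse.ArgumentParser(__doc__)
+#     model_g = ArgumentGroup(parser, "model", "model configuration")
+#     model_g.add_arg("use_fp16", bool, False, "Whether to use fp16.")
+#     args = parser.parse_args()
+#     print_arguments(args)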
diff --git a/ERNIE/utils/fp16.py b/ERNIE/utils/fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..e153c2b9a1029897def264278c5dbe72e1f369f5
--- /dev/null
+++ b/ERNIE/utils/fp16.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle.fluid as fluid
+
+
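+# The helpers below append a raw `cast` op to a program's global block,
+# converting a variable between fp16 and fp32 without going through layers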
+def cast_fp16_to_fp32(i, o, prog):
+ prog.global_block().append_op(
+ type="cast",
+ inputs={"X": i},
+ outputs={"Out": o},
+ attrs={
+ "in_dtype": fluid.core.VarDesc.VarType.FP16,
+ "out_dtype": fluid.core.VarDesc.VarType.FP32
+ })
+
+
+def cast_fp32_to_fp16(i, o, prog):
+ prog.global_block().append_op(
+ type="cast",
+ inputs={"X": i},
+ outputs={"Out": o},
+ attrs={
+ "in_dtype": fluid.core.VarDesc.VarType.FP32,
+ "out_dtype": fluid.core.VarDesc.VarType.FP16
+ })
+
+
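+# Build an fp32 "master" copy of a parameter. With mixed precision, the
+# optimizer updates the fp32 master, and master_param_to_train_param casts
+# it back to the fp16 training parameter after each update.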
+def copy_to_master_param(p, block):
+ v = block.vars.get(p.name, None)
+ if v is None:
+ raise ValueError("no param name %s found!" % p.name)
+ new_p = fluid.framework.Parameter(
+ block=block,
+ shape=v.shape,
+ dtype=fluid.core.VarDesc.VarType.FP32,
+ type=v.type,
+ lod_level=v.lod_level,
+ stop_gradient=p.stop_gradient,
+ trainable=p.trainable,
+ optimize_attr=p.optimize_attr,
+ regularizer=p.regularizer,
+ gradient_clip_attr=p.gradient_clip_attr,
+ error_clip=p.error_clip,
+ name=v.name + ".master")
+ return new_p
+
+
+def create_master_params_grads(params_grads, main_prog, startup_prog,
+ loss_scaling):
+ master_params_grads = []
+ tmp_role = main_prog._current_role
+ OpRole = fluid.core.op_proto_and_checker_maker.OpRole
+ main_prog._current_role = OpRole.Backward
+ for p, g in params_grads:
+ # create master parameters
+ master_param = copy_to_master_param(p, main_prog.global_block())
+ startup_master_param = startup_prog.global_block()._clone_variable(
+ master_param)
+ startup_p = startup_prog.global_block().var(p.name)
+ cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
+        # cast fp16 gradients to fp32 before applying them
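+        # layer_norm parameters are kept in fp32 for numerical stability, so
+        # their gradients only need to be un-scaled, not cast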
+ if g.name.find("layer_norm") > -1:
+ if loss_scaling > 1:
+ scaled_g = g / float(loss_scaling)
+ else:
+ scaled_g = g
+ master_params_grads.append([p, scaled_g])
+ continue
+ master_grad = fluid.layers.cast(g, "float32")
+ if loss_scaling > 1:
+ master_grad = master_grad / float(loss_scaling)
+ master_params_grads.append([master_param, master_grad])
+ main_prog._current_role = tmp_role
+ return master_params_grads
+
+
+def master_param_to_train_param(master_params_grads, params_grads, main_prog):
+ for idx, m_p_g in enumerate(master_params_grads):
+ train_p, _ = params_grads[idx]
+ if train_p.name.find("layer_norm") > -1:
+ continue
+ with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
+ cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
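+
+
+# Intended call sequence (illustrative sketch; `optimizer`, `loss`,
+# `train_program`, `startup_prog`, and `loss_scaling` are assumed to be
+# defined by the caller):
+#
+#     params_grads = optimizer.backward(loss)
+#     master_pg = create_master_params_grads(
+#         params_grads, train_program, startup_prog, loss_scaling)
+#     optimizer.apply_gradients(master_pg)
+#     master_param_to_train_param(master_pg, params_grads, train_program)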
diff --git a/ERNIE/utils/init.py b/ERNIE/utils/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..3844d01298ecbb70aed37b467aebca62caadd391
--- /dev/null
+++ b/ERNIE/utils/init.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+
+import numpy as np
+import paddle.fluid as fluid
+
+
+def cast_fp32_to_fp16(exe, main_program):
+ print("Cast parameters to float16 data format.")
+ for param in main_program.global_block().all_parameters():
+ if not param.name.endswith(".master"):
+ param_t = fluid.global_scope().find_var(param.name).get_tensor()
+ data = np.array(param_t)
+ if param.name.find("layer_norm") == -1:
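+                # view the fp16 values as uint16 so the raw bit pattern can
+                # be written into the tensor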
+ param_t.set(np.float16(data).view(np.uint16), exe.place)
+ master_param_var = fluid.global_scope().find_var(param.name +
+ ".master")
+ if master_param_var is not None:
+ master_param_var.get_tensor().set(data, exe.place)
+
+
+def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
+ assert os.path.exists(
+ init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
+
+    def existed_persistables(var):
+ if not fluid.io.is_persistable(var):
+ return False
+ return os.path.exists(os.path.join(init_checkpoint_path, var.name))
+
+ fluid.io.load_vars(
+ exe,
+ init_checkpoint_path,
+ main_program=main_program,
+        predicate=existed_persistables)
+ print("Load model from {}".format(init_checkpoint_path))
+
+ if use_fp16:
+ cast_fp32_to_fp16(exe, main_program)
+
+
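+# Unlike init_checkpoint, this restores only model Parameters (no optimizer
+# or other persistable state), which suits warm-starting from a pretrained
+# model before fine-tuning.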
+def init_pretraining_params(exe,
+ pretraining_params_path,
+ main_program,
+ use_fp16=False):
+ assert os.path.exists(pretraining_params_path
+ ), "[%s] cann't be found." % pretraining_params_path
+
+ def existed_params(var):
+ if not isinstance(var, fluid.framework.Parameter):
+ return False
+ return os.path.exists(os.path.join(pretraining_params_path, var.name))
+
+ fluid.io.load_vars(
+ exe,
+ pretraining_params_path,
+ main_program=main_program,
+ predicate=existed_params)
+ print("Load pretraining parameters from {}.".format(
+ pretraining_params_path))
+
+ if use_fp16:
+ cast_fp32_to_fp16(exe, main_program)