Unverified commit fbeac9b3 authored by T tianxin, committed by GitHub

Merge pull request #64 from PaddlePaddle/upgrade_ernie

Simplify ernie model structure
......@@ -115,7 +115,7 @@ class BertModel(object):
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=1000.0, bias=-1.0, bias_after_scale=False)
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
......
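The constant change above is the whole fix: with `bias_after_scale=False`, `fluid.layers.scale` computes `scale * (x + bias)`, so valid token pairs map to 0 and padded pairs to `-scale`. A minimal NumPy sketch of that arithmetic (op semantics as documented for Paddle Fluid 1.x; array values illustrative):

```python
import numpy as np

# pair_mask comes from matmul(input_mask, input_mask^T): 1 where both
# tokens are real, 0 where either one is padding.
pair_mask = np.array([[1., 1., 0.]])
bias_old = 1000.0 * (pair_mask - 1.0)    # [0, 0, -1000]
bias_new = 10000.0 * (pair_mask - 1.0)   # [0, 0, -10000]
# Both drive softmax weights on padding toward 0 in float32; -10000 matches
# the masking constant used by the reference BERT implementation.
print(bias_old, bias_new)
```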
......@@ -166,7 +166,7 @@ nlpcc-dbqa is organized by NLPCC (the International Conference on Natural Language Processing and Chinese Computing) in 2016
2) [Task data download](https://ernie.bj.bcebos.com/task_data.tgz)
### Installation
This project depends on Paddle Fluid 1.3.0; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
This project depends on Paddle Fluid 1.3.1; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.
**Note**: Pretraining and finetuning tasks were tested on P40 GPUs with 22 GB of memory; with less than 22 GB, some tasks may fail with out-of-memory errors.
......
......@@ -124,7 +124,7 @@ def prepare_batch_data(insts,
cls_id=None,
sep_id=None,
mask_id=None,
return_attn_bias=True,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
......@@ -149,14 +149,13 @@ def prepare_batch_data(insts,
MASK=mask_id)
# Second step: padding
src_id, next_sent_index, self_attn_bias = pad_batch_data(
out, pad_idx=pad_id, return_next_sent_pos=True, return_attn_bias=True)
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)
return_list = [
src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels,
next_sent_index
src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos, labels
]
return return_list
......@@ -165,8 +164,7 @@ def prepare_batch_data(insts,
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_next_sent_pos=False,
return_attn_bias=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False):
"""
......@@ -182,15 +180,6 @@ def pad_batch_data(insts,
[inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# next_sent_pos for extract first token embedding of each sentence
if return_next_sent_pos:
batch_size = inst_data.shape[0]
max_seq_len = inst_data.shape[1]
next_sent_index = np.array(
range(0, batch_size * max_seq_len, max_seq_len)).astype(
"int64").reshape(-1, 1)
return_list += [next_sent_index]
# position data
if return_pos:
inst_pos = np.array([
......@@ -200,13 +189,12 @@ def pad_batch_data(insts,
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_attn_bias:
if return_input_mask:
# This is used to avoid attention on paddings.
slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
(max_len - len(inst)) for inst in insts])
slf_attn_bias_data = np.tile(
slf_attn_bias_data.reshape([-1, 1, max_len]), [1, max_len, 1])
return_list += [slf_attn_bias_data.astype("float32")]
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
......
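Put together, the new `return_input_mask` branch pads the batch and emits a `[batch, max_len, 1]` float mask instead of the old pre-tiled `[batch, max_len, max_len]` attention bias. A self-contained NumPy sketch of that logic (the helper name `build_input_mask` is ours, not from the repo):

```python
import numpy as np

# Pad each instance to max_len and emit a [batch, max_len, 1] float mask
# with 1 over real tokens and 0 over padding.
def build_input_mask(insts, pad_idx=0):
    max_len = max(len(inst) for inst in insts)
    inst_data = np.array(
        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
    input_mask = np.array(
        [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
    input_mask = np.expand_dims(input_mask, axis=-1).astype("float32")
    return inst_data.astype("int64").reshape([-1, max_len, 1]), input_mask

ids, mask = build_input_mask([[5, 6, 7], [8, 9]])
print(ids.shape, mask.shape)   # (2, 3, 1) (2, 3, 1)
print(mask[1, :, 0])           # [1. 1. 0.] -- second instance has one pad
```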
......@@ -31,26 +31,25 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1],
[-1, 1]],
dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
dtypes=['int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, sent_ids, pos_ids, self_attn_mask, labels, next_sent_index,
(src_ids, sent_ids, pos_ids, input_mask, labels,
qids) = fluid.layers.read_file(pyreader)
ernie = ErnieModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
self_attn_mask=self_attn_mask,
input_mask=input_mask,
config=ernie_config,
use_fp16=args.use_fp16)
cls_feats = ernie.get_pooled_output(next_sent_index)
cls_feats = ernie.get_pooled_output()
cls_feats = fluid.layers.dropout(
x=cls_feats,
dropout_prob=0.1,
......@@ -67,8 +66,7 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
if is_prediction:
probs = fluid.layers.softmax(logits)
feed_targets_name = [
src_ids.name, pos_ids.name, sent_ids.name, self_attn_mask.name,
next_sent_index.name
src_ids.name, pos_ids.name, sent_ids.name, input_mask.name
]
return pyreader, probs, feed_targets_name
......
......@@ -29,28 +29,26 @@ from six.moves import xrange
from model.ernie import ErnieModel
def create_model(args,
pyreader_name,
ernie_config,
is_prediction=False):
def create_model(args, pyreader_name, ernie_config, is_prediction=False):
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, args.max_seq_len],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, 1]],
dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64'],
dtypes=['int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, sent_ids, pos_ids, self_attn_mask, labels,
(src_ids, sent_ids, pos_ids, input_mask, labels,
seq_lens) = fluid.layers.read_file(pyreader)
ernie = ErnieModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
self_attn_mask=self_attn_mask,
input_mask=input_mask,
config=ernie_config,
use_fp16=args.use_fp16)
......@@ -63,33 +61,40 @@ def create_model(args,
name="cls_seq_label_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_seq_label_out_b", initializer=fluid.initializer.Constant(0.)))
name="cls_seq_label_out_b",
initializer=fluid.initializer.Constant(0.)))
ret_labels = fluid.layers.reshape(x=labels, shape=[-1,1])
ret_infers = fluid.layers.reshape(x=fluid.layers.argmax(logits, axis=2), shape=[-1,1])
ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(
logits, axis=2), shape=[-1, 1])
labels = fluid.layers.flatten(labels, axis=2)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=fluid.layers.flatten(logits, axis=2),
label=labels, return_softmax=True)
logits=fluid.layers.flatten(
logits, axis=2),
label=labels,
return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
if args.use_fp16 and args.loss_scaling > 1.0:
loss *= args.loss_scaling
graph_vars = {"loss": loss,
"probs": probs,
"labels": ret_labels,
"infers": ret_infers,
"seq_lens": seq_lens}
graph_vars = {
"loss": loss,
"probs": probs,
"labels": ret_labels,
"infers": ret_infers,
"seq_lens": seq_lens
}
for k, v in graph_vars.items():
v.persistable=True
v.persistable = True
return pyreader, graph_vars
def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
def extract_bio_chunk(seq):
chunks = []
cur_chunk = None
......@@ -109,18 +114,18 @@ def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
if cur_chunk is not None:
chunks.append(cur_chunk)
cur_chunk = {}
cur_chunk = {"st":index, "en": index + 1, "type": tag_type}
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
else:
if cur_chunk is None:
cur_chunk = {"st":index, "en": index + 1, "type": tag_type}
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
continue
if cur_chunk["type"] == tag_type:
cur_chunk["en"] = index + 1
cur_chunk["en"] = index + 1
else:
chunks.append(cur_chunk)
cur_chunk = {"st":index, "en": index + 1, "type": tag_type}
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
if cur_chunk is not None:
chunks.append(cur_chunk)
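For clarity, here is a standalone version of `extract_bio_chunk` with a worked example. It assumes the usual id scheme for this kind of reader: B-tags on even ids (`tag_type = id // 2`), I-tags on odd ids, and `O` as the highest id. That scheme is our assumption for the demo; the diff itself does not show how `tag_type` is derived.

```python
# Illustrative, standalone re-implementation of extract_bio_chunk.
def extract_bio_chunk(seq, o_tag):
    chunks, cur_chunk = [], None
    for index, tag in enumerate(seq):
        if tag == o_tag:
            if cur_chunk is not None:
                chunks.append(cur_chunk)
                cur_chunk = None
            continue
        tag_type = tag // 2
        if tag % 2 == 0:  # B-tag: always starts a new chunk
            if cur_chunk is not None:
                chunks.append(cur_chunk)
            cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
        else:             # I-tag: extend, or start a chunk if none is open
            if cur_chunk is None:
                cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
            elif cur_chunk["type"] == tag_type:
                cur_chunk["en"] = index + 1
            else:
                chunks.append(cur_chunk)
                cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
    if cur_chunk is not None:
        chunks.append(cur_chunk)
    return chunks

# B-PER=0, I-PER=1, B-LOC=2, I-LOC=3, O=4:
print(extract_bio_chunk([0, 1, 4, 2, 3, 3], o_tag=4))
# [{'st': 0, 'en': 2, 'type': 0}, {'st': 3, 'en': 6, 'type': 1}]
```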
......@@ -151,14 +156,19 @@ def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
infer_index = 0
label_index = 0
while label_index < len(label_chunks) and infer_index < len(infer_chunks):
if infer_chunks[infer_index]["st"] < label_chunks[label_index]["st"]:
while label_index < len(label_chunks) \
and infer_index < len(infer_chunks):
if infer_chunks[infer_index]["st"] \
< label_chunks[label_index]["st"]:
infer_index += 1
elif infer_chunks[infer_index]["st"] > label_chunks[label_index]["st"]:
elif infer_chunks[infer_index]["st"] \
> label_chunks[label_index]["st"]:
label_index += 1
else:
if infer_chunks[infer_index]["en"] == label_chunks[label_index]["en"] and \
infer_chunks[infer_index]["type"] == label_chunks[label_index]["type"]:
if infer_chunks[infer_index]["en"] \
== label_chunks[label_index]["en"] \
and infer_chunks[infer_index]["type"] \
== label_chunks[label_index]["type"]:
num_correct += 1
infer_index += 1
......@@ -168,6 +178,7 @@ def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
return num_label, num_infer, num_correct
def calculate_f1(num_label, num_infer, num_correct):
if num_infer == 0:
precision = 0.0
......@@ -185,10 +196,18 @@ def calculate_f1(num_label, num_infer, num_correct):
f1 = 2 * precision * recall / (precision + recall)
return precision, recall, f1
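A condensed version of `calculate_f1` with a worked example: 8 predicted chunks, 10 gold chunks, and 6 exact matches give precision 0.75, recall 0.60, and F1 ≈ 0.667.

```python
# Chunk-level F1: precision = correct / predicted, recall = correct / gold.
def calculate_f1(num_label, num_infer, num_correct):
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if num_correct else 0.0)
    return precision, recall, f1

print(calculate_f1(num_label=10, num_infer=8, num_correct=6))
# (0.75, 0.6, 0.6666...)
```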
def evaluate(exe, program, pyreader, graph_vars, tag_num, eval_phase, dev_count=1):
fetch_list = [graph_vars["labels"].name,
graph_vars["infers"].name,
graph_vars["seq_lens"].name]
def evaluate(exe,
program,
pyreader,
graph_vars,
tag_num,
eval_phase,
dev_count=1):
fetch_list = [
graph_vars["labels"].name, graph_vars["infers"].name,
graph_vars["seq_lens"].name
]
if eval_phase == "train":
fetch_list.append(graph_vars["loss"].name)
......@@ -196,9 +215,15 @@ def evaluate(exe, program, pyreader, graph_vars, tag_num, eval_phase, dev_count=
fetch_list.append(graph_vars["learning_rate"].name)
outputs = exe.run(fetch_list=fetch_list)
np_labels, np_infers, np_lens, np_loss = outputs[:4]
num_label, num_infer, num_correct = chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count)
num_label, num_infer, num_correct = chunk_eval(
np_labels, np_infers, np_lens, tag_num, dev_count)
precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
outputs = {"precision": precision, "recall": recall, "f1": f1, "loss": np.mean(np_loss)}
outputs = {
"precision": precision,
"recall": recall,
"f1": f1,
"loss": np.mean(np_loss)
}
if "learning_rate" in graph_vars:
outputs["lr"] = float(outputs[4][0])
return outputs
......@@ -209,8 +234,10 @@ def evaluate(exe, program, pyreader, graph_vars, tag_num, eval_phase, dev_count=
pyreader.start()
while True:
try:
np_labels, np_infers, np_lens = exe.run(program=program, fetch_list=fetch_list)
label_num, infer_num, correct_num = chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count)
np_labels, np_infers, np_lens = exe.run(program=program,
fetch_list=fetch_list)
label_num, infer_num, correct_num = chunk_eval(
np_labels, np_infers, np_lens, tag_num, dev_count)
total_infer += infer_num
total_label += label_num
total_correct += correct_num
......@@ -219,8 +246,10 @@ def evaluate(exe, program, pyreader, graph_vars, tag_num, eval_phase, dev_count=
pyreader.reset()
break
precision, recall, f1 = calculate_f1(total_label, total_infer, total_correct)
precision, recall, f1 = calculate_f1(total_label, total_infer,
total_correct)
time_end = time.time()
print("[%s evaluation] f1: %f, precision: %f, recall: %f, elapsed time: %f s" %
(eval_phase, f1, precision, recall, time_end - time_begin))
print(
"[%s evaluation] f1: %f, precision: %f, recall: %f, elapsed time: %f s"
% (eval_phase, f1, precision, recall, time_end - time_begin))
......@@ -52,7 +52,7 @@ class ErnieModel(object):
src_ids,
position_ids,
sentence_ids,
self_attn_mask,
input_mask,
config,
weight_sharing=True,
use_fp16=False):
......@@ -78,9 +78,9 @@ class ErnieModel(object):
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._build_model(src_ids, position_ids, sentence_ids, self_attn_mask)
self._build_model(src_ids, position_ids, sentence_ids, input_mask)
def _build_model(self, src_ids, position_ids, sentence_ids, self_attn_mask):
def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
......@@ -110,9 +110,12 @@ class ErnieModel(object):
emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
if self._dtype == "float16":
self_attn_mask = fluid.layers.cast(
x=self_attn_mask, dtype=self._dtype)
input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
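After this refactor, the readers ship a `[batch, seq_len, 1]` `input_mask` and `ErnieModel` derives the per-head attention bias itself. A NumPy shape walk-through of the three steps above (batch=2, seq_len=3, n_head=4; values illustrative):

```python
import numpy as np

input_mask = np.array([[[1.], [1.], [0.]],
                       [[1.], [0.], [0.]]], dtype="float32")  # [2, 3, 1]
# matmul(x, x, transpose_y=True): pairwise validity, [2, 3, 3]
self_attn_mask = np.matmul(input_mask, input_mask.transpose(0, 2, 1))
# scale * (x + bias): 0 for valid pairs, -10000 where padding is involved
self_attn_mask = 10000.0 * (self_attn_mask - 1.0)
# replicate across attention heads, [2, 4, 3, 3]
n_head_self_attn_mask = np.stack([self_attn_mask] * 4, axis=1)
print(n_head_self_attn_mask.shape)  # (2, 4, 3, 3)
```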
......@@ -138,13 +141,10 @@ class ErnieModel(object):
def get_sequence_output(self):
return self._enc_out
def get_pooled_output(self, next_sent_index):
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
self._reshaped_emb_out = fluid.layers.reshape(
x=self._enc_out, shape=[-1, self._emb_size], inplace=True)
next_sent_index = fluid.layers.cast(x=next_sent_index, dtype='int32')
next_sent_feat = fluid.layers.gather(
input=self._reshaped_emb_out, index=next_sent_index)
next_sent_feat = fluid.layers.slice(
input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
......@@ -154,17 +154,17 @@ class ErnieModel(object):
bias_attr="pooled_fc.b_0")
return next_sent_feat
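The slice replaces the old flatten-and-gather via `next_sent_index`; both select the first ([CLS]) token of each sequence. A quick NumPy equivalence check:

```python
import numpy as np

enc_out = np.arange(24, dtype="float32").reshape(2, 3, 4)  # batch=2, len=3, emb=4

# old path: flatten to [batch * seq_len, emb], gather rows 0, seq_len, ...
flat = enc_out.reshape(-1, 4)
next_sent_index = np.arange(0, 2 * 3, 3)
old = flat[next_sent_index]

# new path: slice position 0 along the sequence axis
new = enc_out[:, 0:1, :].reshape(-1, 4)

assert np.array_equal(old, new)
```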
def get_pretraining_output(self, mask_label, mask_pos, labels,
next_sent_index):
def get_pretraining_output(self, mask_label, mask_pos, labels):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
# extract the first token feature in each sentence
next_sent_feat = self.get_pooled_output(next_sent_index)
next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(
x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(
input=self._reshaped_emb_out, index=mask_pos)
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
......
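`mask_pos` holds flat indices into the `[batch * seq_len, emb_size]` view of the encoder output, which is why the gather now goes through the locally reshaped `reshaped_emb_out` rather than the cached `self._reshaped_emb_out`. A small NumPy sketch (shapes illustrative):

```python
import numpy as np

enc_out = np.arange(24, dtype="float32").reshape(2, 3, 4)  # batch=2, len=3
reshaped_emb_out = enc_out.reshape(-1, 4)                  # [6, 4]
mask_pos = np.array([1, 5])   # token 1 of sequence 0, token 2 of sequence 1
mask_feat = reshaped_emb_out[mask_pos]
print(mask_feat.shape)        # (2, 4) -- one feature row per masked token
```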
......@@ -171,9 +171,12 @@ class ErnieDataReader(object):
if len(token_seq) > self.max_seq_len:
miss_num += 1
continue
type_seq = [0] * (len(left_tokens) + 2) + [1] * (len(right_tokens) + 1)
type_seq = [0] * (len(left_tokens) + 2) + [1] * (len(right_tokens) +
1)
pos_seq = range(len(token_seq))
seg_label_seq = [-1] + left_seg_labels + [-1] + right_seg_labels + [-1]
seg_label_seq = [-1] + left_seg_labels + [-1] + right_seg_labels + [
-1
]
assert len(token_seq) == len(type_seq) == len(pos_seq) == len(seg_label_seq), \
"[ERROR]len(src_id) == lne(sent_id) == len(pos_id) must be True"
......@@ -290,7 +293,7 @@ class ErnieDataReader(object):
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=self.mask_id,
return_attn_bias=True,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
......
......@@ -247,11 +247,8 @@ class ClassifyReader(BaseReader):
batch_qids = np.array([]).astype("int64").reshape([-1, 1])
# padding
padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
return_next_sent_pos=True,
return_attn_bias=True)
padded_token_ids, input_mask = pad_batch_data(
batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
......@@ -259,7 +256,7 @@ class ClassifyReader(BaseReader):
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
self_attn_bias, batch_labels, next_sent_index, batch_qids
input_mask, batch_labels, batch_qids
]
return return_list
......@@ -274,11 +271,8 @@ class SequenceLabelReader(BaseReader):
batch_seq_lens = [len(record.token_ids) for record in batch_records]
# padding
padded_token_ids, self_attn_bias = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
return_next_sent_pos=False,
return_attn_bias=True)
padded_token_ids, input_mask = pad_batch_data(
batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
......@@ -290,7 +284,7 @@ class SequenceLabelReader(BaseReader):
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
self_attn_bias, padded_label_ids, batch_seq_lens
input_mask, padded_label_ids, batch_seq_lens
]
return return_list
......
......@@ -43,31 +43,29 @@ def create_model(pyreader_name, ernie_config):
pyreader = fluid.layers.py_reader(
capacity=70,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1],
[-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64',
'int64'
'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'
],
lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, pos_ids, sent_ids, self_attn_mask, mask_label, mask_pos, labels,
next_sent_index) = fluid.layers.read_file(pyreader)
(src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos,
labels) = fluid.layers.read_file(pyreader)
ernie = ErnieModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
self_attn_mask=self_attn_mask,
input_mask=input_mask,
config=ernie_config,
weight_sharing=args.weight_sharing,
use_fp16=args.use_fp16)
next_sent_acc, mask_lm_loss, total_loss = ernie.get_pretraining_output(
mask_label, mask_pos, labels, next_sent_index)
mask_label, mask_pos, labels)
if args.use_fp16 and args.loss_scaling > 1.0:
total_loss *= args.loss_scaling
......