提交 78489c4b 编写于 作者: T tianxin04

format

上级 354d97a8
...@@ -19,7 +19,15 @@ from __future__ import print_function ...@@ -19,7 +19,15 @@ from __future__ import print_function
import numpy as np import numpy as np
def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
def mask(batch_tokens,
seg_labels,
mask_word_tags,
total_token_num,
vocab_size,
CLS=1,
SEP=2,
MASK=3):
""" """
Add mask for batch_tokens, return out, mask_label, mask_pos; Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos responding the batch_tokens after padded; Note: mask_pos responding the batch_tokens after padded;
...@@ -90,7 +98,8 @@ def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size, ...@@ -90,7 +98,8 @@ def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size,
# random replace # random replace
if token != SEP and token != CLS: if token != SEP and token != CLS:
mask_label.append(sent[token_index]) mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index] sent[token_index] = replace_ids[prob_index +
token_index]
mask_flag = True mask_flag = True
mask_pos.append(sent_index * max_len + token_index) mask_pos.append(sent_index * max_len + token_index)
else: else:
...@@ -143,7 +152,10 @@ def prepare_batch_data(insts, ...@@ -143,7 +152,10 @@ def prepare_batch_data(insts,
pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id) pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id) sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)
return_list = [src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels, next_sent_index] return_list = [
src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels,
next_sent_index
]
return return_list return return_list
...@@ -207,4 +219,5 @@ def pad_batch_data(insts, ...@@ -207,4 +219,5 @@ def pad_batch_data(insts,
if __name__ == "__main__": if __name__ == "__main__":
pass pass
...@@ -25,22 +25,20 @@ import paddle.fluid as fluid ...@@ -25,22 +25,20 @@ import paddle.fluid as fluid
from model.ernie import ErnieModel from model.ernie import ErnieModel
def create_model(args, def create_model(args, pyreader_name, ernie_config, is_prediction=False):
pyreader_name,
ernie_config,
is_prediction=False):
pyreader = fluid.layers.py_reader( pyreader = fluid.layers.py_reader(
capacity=50, capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1], [-1, 1]], [-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1],
[-1, 1]],
dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64'], dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0], lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name, name=pyreader_name,
use_double_buffer=True) use_double_buffer=True)
(src_ids, sent_ids, pos_ids, self_attn_mask, labels, (src_ids, sent_ids, pos_ids, self_attn_mask, labels, next_sent_index,
next_sent_index, qids) = fluid.layers.read_file(pyreader) qids) = fluid.layers.read_file(pyreader)
ernie = ErnieModel( ernie = ErnieModel(
src_ids=src_ids, src_ids=src_ids,
...@@ -57,7 +55,7 @@ def create_model(args, ...@@ -57,7 +55,7 @@ def create_model(args,
dropout_implementation="upscale_in_train") dropout_implementation="upscale_in_train")
logits = fluid.layers.fc( logits = fluid.layers.fc(
input=cls_feats, input=cls_feats,
size=ernie_config["num_labels"], size=args.num_labels,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name="cls_out_w", name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)), initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
...@@ -82,18 +80,21 @@ def create_model(args, ...@@ -82,18 +80,21 @@ def create_model(args,
num_seqs = fluid.layers.create_tensor(dtype='int64') num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
graph_vars = {"loss": loss, graph_vars = {
"probs": probs, "loss": loss,
"accuracy": accuracy, "probs": probs,
"labels": labels, "accuracy": accuracy,
"num_seqs": num_seqs, "labels": labels,
"qids": qids} "num_seqs": num_seqs,
"qids": qids
}
for k, v in graph_vars.items(): for k, v in graph_vars.items():
v.persistable=True v.persistable = True
return pyreader, graph_vars return pyreader, graph_vars
def evaluate_mrr(preds): def evaluate_mrr(preds):
last_qid = None last_qid = None
total_mrr = 0.0 total_mrr = 0.0
...@@ -114,6 +115,7 @@ def evaluate_mrr(preds): ...@@ -114,6 +115,7 @@ def evaluate_mrr(preds):
return total_mrr / qnum return total_mrr / qnum
def evaluate_map(preds): def evaluate_map(preds):
def singe_map(st, en): def singe_map(st, en):
total_p = 0.0 total_p = 0.0
...@@ -142,17 +144,18 @@ def evaluate_map(preds): ...@@ -142,17 +144,18 @@ def evaluate_map(preds):
total_map += singe_map(st, len(preds)) total_map += singe_map(st, len(preds))
return total_map / qnum return total_map / qnum
def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase): def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
train_fetch_list = [graph_vars["loss"].name, train_fetch_list = [
graph_vars["accuracy"].name, graph_vars["loss"].name, graph_vars["accuracy"].name,
graph_vars["num_seqs"].name graph_vars["num_seqs"].name
] ]
if eval_phase == "train": if eval_phase == "train":
if "learning_rate" in graph_vars: if "learning_rate" in graph_vars:
train_fetch_list.append(graph_vars["learning_rate"].name) train_fetch_list.append(graph_vars["learning_rate"].name)
outputs = exe.run(fetch_list=train_fetch_list) outputs = exe.run(fetch_list=train_fetch_list)
ret = {"loss":np.mean(outputs[0]), "accuracy":np.mean(outputs[1])} ret = {"loss": np.mean(outputs[0]), "accuracy": np.mean(outputs[1])}
if "learning_rate" in graph_vars: if "learning_rate" in graph_vars:
ret["learning_rate"] = float(outputs[4][0]) ret["learning_rate"] = float(outputs[4][0])
return ret return ret
...@@ -162,22 +165,21 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase): ...@@ -162,22 +165,21 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
qids, labels, scores = [], [], [] qids, labels, scores = [], [], []
time_begin = time.time() time_begin = time.time()
fetch_list = [graph_vars["loss"].name, fetch_list = [
graph_vars["accuracy"].name, graph_vars["loss"].name, graph_vars["accuracy"].name,
graph_vars["probs"].name, graph_vars["probs"].name, graph_vars["labels"].name,
graph_vars["labels"].name, graph_vars["num_seqs"].name, graph_vars["qids"].name
graph_vars["num_seqs"].name, ]
graph_vars["qids"].name]
while True: while True:
try: try:
np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(program=test_program, np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(
fetch_list=fetch_list) program=test_program, fetch_list=fetch_list)
total_cost += np.sum(np_loss * np_num_seqs) total_cost += np.sum(np_loss * np_num_seqs)
total_acc += np.sum(np_acc * np_num_seqs) total_acc += np.sum(np_acc * np_num_seqs)
total_num_seqs += np.sum(np_num_seqs) total_num_seqs += np.sum(np_num_seqs)
labels.extend(np_labels.reshape((-1)).tolist()) labels.extend(np_labels.reshape((-1)).tolist())
qids.extend(np_qids.reshape(-1).tolist()) qids.extend(np_qids.reshape(-1).tolist())
scores.extend(np_probs[:,1].reshape(-1).tolist()) scores.extend(np_probs[:, 1].reshape(-1).tolist())
np_preds = np.argmax(np_probs, axis=1).astype(np.float32) np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
total_label_pos_num += np.sum(np_labels) total_label_pos_num += np.sum(np_labels)
total_pred_pos_num += np.sum(np_preds) total_pred_pos_num += np.sum(np_preds)
...@@ -188,20 +190,23 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase): ...@@ -188,20 +190,23 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
time_end = time.time() time_end = time.time()
if len(qids) == 0: if len(qids) == 0:
print("[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s" % print(
(eval_phase, total_cost / total_num_seqs, "[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s"
total_acc / total_num_seqs, total_num_seqs, time_end - time_begin)) % (eval_phase, total_cost / total_num_seqs, total_acc /
total_num_seqs, total_num_seqs, time_end - time_begin))
else: else:
r = total_correct_num / total_label_pos_num r = total_correct_num / total_label_pos_num
p = total_correct_num / total_pred_pos_num p = total_correct_num / total_pred_pos_num
f = 2 * p * r / (p + r) f = 2 * p * r / (p + r)
assert len(qids) == len(labels) == len(scores) assert len(qids) == len(labels) == len(scores)
preds = sorted(zip(qids, scores, labels), key=lambda elem:(elem[0], -elem[1])) preds = sorted(
zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1]))
mrr = evaluate_mrr(preds) mrr = evaluate_mrr(preds)
map = evaluate_map(preds) map = evaluate_map(preds)
print("[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s" % print(
(eval_phase, total_cost / total_num_seqs, "[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s"
total_acc / total_num_seqs, % (eval_phase, total_cost / total_num_seqs,
mrr, map, p, r, f, total_num_seqs, time_end - time_begin)) total_acc / total_num_seqs, mrr, map, p, r, f, total_num_seqs,
time_end - time_begin))
...@@ -64,7 +64,7 @@ data_g.add_arg("do_lower_case", bool, True, ...@@ -64,7 +64,7 @@ data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.") "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, 0, "Random seed.") data_g.add_arg("random_seed", int, 0, "Random seed.")
data_g.add_arg("label_map_config", str, None, "label_map_path.") data_g.add_arg("label_map_config", str, None, "label_map_path.")
data_g.add_arg("num_labels", int, 2, "label number") data_g.add_arg("num_labels", int, 2, "label number")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.") run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
...@@ -74,3 +74,4 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe ...@@ -74,3 +74,4 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.") run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.") run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.") run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
# yapf: enable
...@@ -24,7 +24,6 @@ from utils.args import ArgumentGroup, print_arguments ...@@ -24,7 +24,6 @@ from utils.args import ArgumentGroup, print_arguments
# yapf: disable # yapf: disable
parser = argparse.ArgumentParser(__doc__) parser = argparse.ArgumentParser(__doc__)
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.") model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("ernie_config_path", str, "./config/ernie_config.json", "Path to the json file for ernie model config.") model_g.add_arg("ernie_config_path", str, "./config/ernie_config.json", "Path to the json file for ernie model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.") model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
......
...@@ -30,6 +30,7 @@ import paddle.fluid as fluid ...@@ -30,6 +30,7 @@ import paddle.fluid as fluid
from batching import prepare_batch_data from batching import prepare_batch_data
class ErnieDataReader(object): class ErnieDataReader(object):
def __init__(self, def __init__(self,
filelist, filelist,
...@@ -81,8 +82,8 @@ class ErnieDataReader(object): ...@@ -81,8 +82,8 @@ class ErnieDataReader(object):
sent_ids = [int(token) for token in sent_ids.split(" ")] sent_ids = [int(token) for token in sent_ids.split(" ")]
pos_ids = [int(token) for token in pos_ids.split(" ")] pos_ids = [int(token) for token in pos_ids.split(" ")]
seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")] seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")]
assert len(token_ids) == len(sent_ids) == len( assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
pos_ids) == len(seg_labels seg_labels
), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)" ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)"
label = int(label) label = int(label)
if len(token_ids) > max_seq_len: if len(token_ids) > max_seq_len:
...@@ -153,14 +154,17 @@ class ErnieDataReader(object): ...@@ -153,14 +154,17 @@ class ErnieDataReader(object):
if left_len <= max_len: if left_len <= max_len:
return (token_seq[1:sep_index], seg_labels[1:sep_index]) return (token_seq[1:sep_index], seg_labels[1:sep_index])
else: else:
return [token_seq[sep_index + 1: -1], seg_labels[sep_index + 1 : -1]] return [
token_seq[sep_index + 1:-1], seg_labels[sep_index + 1:-1]
]
for i in range(num_sample): for i in range(num_sample):
pair_index = (i + 1) % num_sample pair_index = (i + 1) % num_sample
left_tokens, left_seg_labels = split_sent(pos_samples[i], left_tokens, left_seg_labels = split_sent(
(self.max_seq_len - 3) // 2, self.sep_id) pos_samples[i], (self.max_seq_len - 3) // 2, self.sep_id)
right_tokens, right_seg_labels = split_sent(pos_samples[pair_index], right_tokens, right_seg_labels = split_sent(
self.max_seq_len - 3 - len(left_tokens), self.sep_id) pos_samples[pair_index],
self.max_seq_len - 3 - len(left_tokens), self.sep_id)
token_seq = [self.cls_id] + left_tokens + [self.sep_id] + \ token_seq = [self.cls_id] + left_tokens + [self.sep_id] + \
right_tokens + [self.sep_id] right_tokens + [self.sep_id]
......
...@@ -62,7 +62,7 @@ class BaseReader(object): ...@@ -62,7 +62,7 @@ class BaseReader(object):
reader = csv.reader(f, delimiter="\t", quotechar=quotechar) reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
headers = next(reader) headers = next(reader)
Example = namedtuple('Example', headers) Example = namedtuple('Example', headers)
examples = [] examples = []
for line in reader: for line in reader:
example = Example(*line) example = Example(*line)
...@@ -85,7 +85,7 @@ class BaseReader(object): ...@@ -85,7 +85,7 @@ class BaseReader(object):
else: else:
tokens_b.pop() tokens_b.pop()
def _convert_example_to_record(self, example, max_seq_length, tokenizer): def _convert_example_to_record(self, example, max_seq_length, tokenizer):
"""Converts a single `Example` into a single `Record`.""" """Converts a single `Example` into a single `Record`."""
text_a = tokenization.convert_to_unicode(example.text_a) text_a = tokenization.convert_to_unicode(example.text_a)
...@@ -148,7 +148,9 @@ class BaseReader(object): ...@@ -148,7 +148,9 @@ class BaseReader(object):
else: else:
label_id = example.label label_id = example.label
Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid']) Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
qid = None qid = None
if "qid" in example._fields: if "qid" in example._fields:
...@@ -164,11 +166,12 @@ class BaseReader(object): ...@@ -164,11 +166,12 @@ class BaseReader(object):
def _prepare_batch_data(self, examples, batch_size, phase=None): def _prepare_batch_data(self, examples, batch_size, phase=None):
"""generate batch records""" """generate batch records"""
batch_records, max_len = [], 0 batch_records, max_len = [], 0
for index, example in enumerate(examples): for index, example in enumerate(examples):
if phase == "train": if phase == "train":
self.current_example = index self.current_example = index
record = self._convert_example_to_record(example, self.max_seq_len, self.tokenizer) record = self._convert_example_to_record(example, self.max_seq_len,
self.tokenizer)
max_len = max(max_len, len(record.token_ids)) max_len = max(max_len, len(record.token_ids))
if self.in_tokens: if self.in_tokens:
to_append = (len(batch_records) + 1) * max_len <= batch_size to_append = (len(batch_records) + 1) * max_len <= batch_size
...@@ -187,7 +190,12 @@ class BaseReader(object): ...@@ -187,7 +190,12 @@ class BaseReader(object):
examples = self._read_tsv(input_file) examples = self._read_tsv(input_file)
return len(examples) return len(examples)
def data_generator(self, input_file, batch_size, epoch, shuffle=True, phase=None): def data_generator(self,
input_file,
batch_size,
epoch,
shuffle=True,
phase=None):
examples = self._read_tsv(input_file) examples = self._read_tsv(input_file)
def wrapper(): def wrapper():
...@@ -198,8 +206,10 @@ class BaseReader(object): ...@@ -198,8 +206,10 @@ class BaseReader(object):
if shuffle: if shuffle:
np.random.shuffle(examples) np.random.shuffle(examples)
for batch_data in self._prepare_batch_data(examples, batch_size, phase=phase): for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase):
yield batch_data yield batch_data
return wrapper return wrapper
...@@ -209,9 +219,11 @@ class ClassifyReader(BaseReader): ...@@ -209,9 +219,11 @@ class ClassifyReader(BaseReader):
with open(input_file, "r") as f: with open(input_file, "r") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar) reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
headers = next(reader) headers = next(reader)
text_indices = [index for index, h in enumerate(headers) if h != "label"] text_indices = [
index for index, h in enumerate(headers) if h != "label"
]
Example = namedtuple('Example', headers) Example = namedtuple('Example', headers)
examples = [] examples = []
for line in reader: for line in reader:
for index, text in enumerate(line): for index, text in enumerate(line):
...@@ -219,8 +231,8 @@ class ClassifyReader(BaseReader): ...@@ -219,8 +231,8 @@ class ClassifyReader(BaseReader):
line[index] = text.replace(' ', '') line[index] = text.replace(' ', '')
example = Example(*line) example = Example(*line)
examples.append(example) examples.append(example)
return examples return examples
def _pad_batch_records(self, batch_records): def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records] batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records] batch_text_type_ids = [record.text_type_ids for record in batch_records]
...@@ -236,33 +248,50 @@ class ClassifyReader(BaseReader): ...@@ -236,33 +248,50 @@ class ClassifyReader(BaseReader):
# padding # padding
padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data( padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
batch_token_ids, pad_idx=self.pad_id, return_next_sent_pos=True, return_attn_bias=True) batch_token_ids,
padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id) pad_idx=self.pad_id,
padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id) return_next_sent_pos=True,
return_attn_bias=True)
return_list = [padded_token_ids, padded_text_type_ids, padded_position_ids, self_attn_bias, batch_labels, next_sent_index, batch_qids] padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
self_attn_bias, batch_labels, next_sent_index, batch_qids
]
return return_list return return_list
class SequenceLabelReader(BaseReader): class SequenceLabelReader(BaseReader):
def _pad_batch_records(self, batch_records): def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records] batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records] batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records] batch_position_ids = [record.position_ids for record in batch_records]
batch_label_ids = [record.label_ids for record in batch_records] batch_label_ids = [record.label_ids for record in batch_records]
batch_seq_lens = [len(record.token_ids) for record in batch_records] batch_seq_lens = [len(record.token_ids) for record in batch_records]
# padding # padding
padded_token_ids, self_attn_bias = pad_batch_data( padded_token_ids, self_attn_bias = pad_batch_data(
batch_token_ids, pad_idx=self.pad_id, return_next_sent_pos=False, return_attn_bias=True) batch_token_ids,
padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id) pad_idx=self.pad_id,
padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id) return_next_sent_pos=False,
padded_label_ids = pad_batch_data(batch_label_ids, pad_idx=len(self.label_map)-1) return_attn_bias=True)
batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape([-1, 1]) padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
return_list = [padded_token_ids, padded_text_type_ids, padded_position_ids, self_attn_bias, padded_label_ids, batch_seq_lens] padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
padded_label_ids = pad_batch_data(
batch_label_ids, pad_idx=len(self.label_map) - 1)
batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape(
[-1, 1])
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
self_attn_bias, padded_label_ids, batch_seq_lens
]
return return_list return return_list
def _reseg_token_label(self, tokens, labels, tokenizer): def _reseg_token_label(self, tokens, labels, tokenizer):
...@@ -285,7 +314,7 @@ class SequenceLabelReader(BaseReader): ...@@ -285,7 +314,7 @@ class SequenceLabelReader(BaseReader):
assert len(ret_tokens) == len(ret_labels) assert len(ret_tokens) == len(ret_labels)
return ret_tokens, ret_labels return ret_tokens, ret_labels
def _convert_example_to_record(self, example, max_seq_length, tokenizer): def _convert_example_to_record(self, example, max_seq_length, tokenizer):
tokens = tokenization.convert_to_unicode(example.text_a).split(u"") tokens = tokenization.convert_to_unicode(example.text_a).split(u"")
labels = tokenization.convert_to_unicode(example.label).split(u"") labels = tokenization.convert_to_unicode(example.label).split(u"")
tokens, labels = self._reseg_token_label(tokens, labels, tokenizer) tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
...@@ -297,11 +326,15 @@ class SequenceLabelReader(BaseReader): ...@@ -297,11 +326,15 @@ class SequenceLabelReader(BaseReader):
tokens = ["[CLS]"] + tokens + ["[SEP]"] tokens = ["[CLS]"] + tokens + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens) token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids))) position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids) text_type_ids = [0] * len(token_ids)
no_entity_id = len(self.label_map) - 1 no_entity_id = len(self.label_map) - 1
label_ids = [no_entity_id] + [self.label_map[label] for label in labels] + [no_entity_id] label_ids = [no_entity_id] + [
self.label_map[label] for label in labels
] + [no_entity_id]
Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
record = Record( record = Record(
token_ids=token_ids, token_ids=token_ids,
text_type_ids=text_type_ids, text_type_ids=text_type_ids,
...@@ -309,5 +342,6 @@ class SequenceLabelReader(BaseReader): ...@@ -309,5 +342,6 @@ class SequenceLabelReader(BaseReader):
label_ids=label_ids) label_ids=label_ids)
return record return record
if __name__ == '__main__': if __name__ == '__main__':
pass pass
...@@ -32,11 +32,11 @@ from finetune.classifier import create_model, evaluate ...@@ -32,11 +32,11 @@ from finetune.classifier import create_model, evaluate
from optimization import optimization from optimization import optimization
from utils.args import ArgumentGroup, print_arguments from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params, init_checkpoint from utils.init import init_pretraining_params, init_checkpoint
from finetune_args import parser from finetune_args import parser
args = parser.parse_args() args = parser.parse_args()
def main(args): def main(args):
ernie_config = ErnieConfig(args.ernie_config_path) ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config() ernie_config.print_config()
...@@ -49,12 +49,13 @@ def main(args): ...@@ -49,12 +49,13 @@ def main(args):
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place) exe = fluid.Executor(place)
reader = task_reader.ClassifyReader(vocab_path=args.vocab_path, reader = task_reader.ClassifyReader(
label_map_config=args.label_map_config, vocab_path=args.vocab_path,
max_seq_len=args.max_seq_len, label_map_config=args.label_map_config,
do_lower_case=args.do_lower_case, max_seq_len=args.max_seq_len,
in_tokens=args.in_tokens, do_lower_case=args.do_lower_case,
random_seed=args.random_seed) in_tokens=args.in_tokens,
random_seed=args.random_seed)
if not (args.do_train or args.do_val or args.do_test): if not (args.do_train or args.do_val or args.do_test):
raise ValueError("For args `do_train`, `do_val` and `do_test`, at " raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
...@@ -108,10 +109,11 @@ def main(args): ...@@ -108,10 +109,11 @@ def main(args):
fluid.memory_optimize( fluid.memory_optimize(
input_program=train_program, input_program=train_program,
skip_opt_set=[graph_vars["loss"].name, skip_opt_set=[
graph_vars["probs"].name, graph_vars["loss"].name,
graph_vars["accuracy"].name, graph_vars["probs"].name,
graph_vars["num_seqs"].name, graph_vars["accuracy"].name,
graph_vars["num_seqs"].name,
]) ])
if args.verbose: if args.verbose:
...@@ -201,7 +203,8 @@ def main(args): ...@@ -201,7 +203,8 @@ def main(args):
if steps % args.skip_steps != 0: if steps % args.skip_steps != 0:
train_exe.run(fetch_list=[]) train_exe.run(fetch_list=[])
else: else:
outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, "train") outputs = evaluate(train_exe, train_program, train_pyreader,
graph_vars, "train")
if args.verbose: if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
...@@ -217,7 +220,8 @@ def main(args): ...@@ -217,7 +220,8 @@ def main(args):
print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
"ave acc: %f, speed: %f steps/s" % "ave acc: %f, speed: %f steps/s" %
(current_epoch, current_example, num_train_examples, (current_epoch, current_example, num_train_examples,
steps, outputs["loss"], outputs["accuracy"], args.skip_steps / used_time)) steps, outputs["loss"], outputs["accuracy"],
args.skip_steps / used_time))
time_begin = time.time() time_begin = time.time()
if steps % args.save_steps == 0: if steps % args.save_steps == 0:
...@@ -254,7 +258,9 @@ def main(args): ...@@ -254,7 +258,9 @@ def main(args):
if args.do_val: if args.do_val:
test_pyreader.decorate_tensor_provider( test_pyreader.decorate_tensor_provider(
reader.data_generator( reader.data_generator(
args.dev_set, batch_size=args.batch_size, epoch=1, args.dev_set,
batch_size=args.batch_size,
epoch=1,
shuffle=False)) shuffle=False))
print("Final validation result:") print("Final validation result:")
evaluate(exe, test_prog, test_pyreader, graph_vars, "dev") evaluate(exe, test_prog, test_pyreader, graph_vars, "dev")
...@@ -273,4 +279,5 @@ def main(args): ...@@ -273,4 +279,5 @@ def main(args):
if __name__ == '__main__': if __name__ == '__main__':
print_arguments(args) print_arguments(args)
main(args) main(args)
...@@ -30,12 +30,12 @@ from model.ernie import ErnieConfig ...@@ -30,12 +30,12 @@ from model.ernie import ErnieConfig
from optimization import optimization from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint from utils.init import init_pretraining_params, init_checkpoint
from utils.args import print_arguments from utils.args import print_arguments
from finetune.sequence_label import create_model, evaluate from finetune.sequence_label import create_model, evaluate
from finetune_args import parser from finetune_args import parser
args = parser.parse_args() args = parser.parse_args()
def main(args): def main(args):
ernie_config = ErnieConfig(args.ernie_config_path) ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config() ernie_config.print_config()
...@@ -48,12 +48,13 @@ def main(args): ...@@ -48,12 +48,13 @@ def main(args):
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place) exe = fluid.Executor(place)
reader = task_reader.SequenceLabelReader(vocab_path=args.vocab_path, reader = task_reader.SequenceLabelReader(
label_map_config=args.label_map_config, vocab_path=args.vocab_path,
max_seq_len=args.max_seq_len, label_map_config=args.label_map_config,
do_lower_case=args.do_lower_case, max_seq_len=args.max_seq_len,
in_tokens=args.in_tokens, do_lower_case=args.do_lower_case,
random_seed=args.random_seed) in_tokens=args.in_tokens,
random_seed=args.random_seed)
if not (args.do_train or args.do_val or args.do_test): if not (args.do_train or args.do_val or args.do_test):
raise ValueError("For args `do_train`, `do_val` and `do_test`, at " raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
...@@ -107,10 +108,9 @@ def main(args): ...@@ -107,10 +108,9 @@ def main(args):
fluid.memory_optimize( fluid.memory_optimize(
input_program=train_program, input_program=train_program,
skip_opt_set=[graph_vars["loss"].name, skip_opt_set=[
graph_vars["labels"].name, graph_vars["loss"].name, graph_vars["labels"].name,
graph_vars["infers"].name, graph_vars["infers"].name, graph_vars["seq_lens"].name
graph_vars["seq_lens"].name
]) ])
if args.verbose: if args.verbose:
...@@ -200,24 +200,26 @@ def main(args): ...@@ -200,24 +200,26 @@ def main(args):
if steps % args.skip_steps != 0: if steps % args.skip_steps != 0:
train_exe.run(fetch_list=[]) train_exe.run(fetch_list=[])
else: else:
outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, args.num_labels, "train", dev_count) outputs = evaluate(train_exe, train_program, train_pyreader,
graph_vars, args.num_labels, "train",
dev_count)
if args.verbose: if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
) )
verbose += "learning rate: %f" % ( verbose += "learning rate: %f" % (
outputs["lr"] if warmup_steps > 0 else args.learning_rate) outputs["lr"]
if warmup_steps > 0 else args.learning_rate)
print(verbose) print(verbose)
current_example, current_epoch = reader.get_train_progress( current_example, current_epoch = reader.get_train_progress()
)
time_end = time.time() time_end = time.time()
used_time = time_end - time_begin used_time = time_end - time_begin
print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"f1: %f, precision: %f, recall: %f, speed: %f steps/s" % "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
(current_epoch, current_example, num_train_examples, % (current_epoch, current_example, num_train_examples,
steps, outputs["loss"], outputs["f1"], steps, outputs["loss"], outputs["f1"],
outputs["precision"], outputs["recall"], outputs["precision"], outputs["recall"],
args.skip_steps / used_time)) args.skip_steps / used_time))
time_begin = time.time() time_begin = time.time()
if steps % args.save_steps == 0: if steps % args.save_steps == 0:
...@@ -234,7 +236,8 @@ def main(args): ...@@ -234,7 +236,8 @@ def main(args):
batch_size=args.batch_size, batch_size=args.batch_size,
epoch=1, epoch=1,
shuffle=False)) shuffle=False))
evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev") evaluate(exe, test_prog, test_pyreader, graph_vars,
args.num_labels, "dev")
# evaluate test set # evaluate test set
if args.do_test: if args.do_test:
test_pyreader.decorate_tensor_provider( test_pyreader.decorate_tensor_provider(
...@@ -243,7 +246,8 @@ def main(args): ...@@ -243,7 +246,8 @@ def main(args):
batch_size=args.batch_size, batch_size=args.batch_size,
epoch=1, epoch=1,
shuffle=False)) shuffle=False))
evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "test") evaluate(exe, test_prog, test_pyreader, graph_vars,
args.num_labels, "test")
except fluid.core.EOFException: except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints, "step_" + str(steps)) save_path = os.path.join(args.checkpoints, "step_" + str(steps))
...@@ -255,7 +259,9 @@ def main(args): ...@@ -255,7 +259,9 @@ def main(args):
if args.do_val: if args.do_val:
test_pyreader.decorate_tensor_provider( test_pyreader.decorate_tensor_provider(
reader.data_generator( reader.data_generator(
args.dev_set, batch_size=args.batch_size, epoch=1, args.dev_set,
batch_size=args.batch_size,
epoch=1,
shuffle=False)) shuffle=False))
print("Final validation result:") print("Final validation result:")
evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev") evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev")
......
...@@ -35,8 +35,10 @@ from utils.init import init_checkpoint, init_pretraining_params ...@@ -35,8 +35,10 @@ from utils.init import init_checkpoint, init_pretraining_params
from pretrain_args import parser from pretrain_args import parser
args = parser.parse_args() args = parser.parse_args()
# yapf: enable. # yapf: enable.
def create_model(pyreader_name, ernie_config): def create_model(pyreader_name, ernie_config):
pyreader = fluid.layers.py_reader( pyreader = fluid.layers.py_reader(
capacity=70, capacity=70,
...@@ -224,8 +226,7 @@ def train(args): ...@@ -224,8 +226,7 @@ def train(args):
print("train_id == 0, sleep 60s") print("train_id == 0, sleep 60s")
time.sleep(60) time.sleep(60)
print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
trainer_id:{}" trainer_id:{}".format(worker_endpoints, trainers_num,
.format(worker_endpoints, trainers_num,
current_endpoint, trainer_id)) current_endpoint, trainer_id))
# prepare nccl2 env. # prepare nccl2 env.
...@@ -319,13 +320,14 @@ def train(args): ...@@ -319,13 +320,14 @@ def train(args):
epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress( epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
) )
print("current learning_rate:%f" % np_lr[0]) print("current learning_rate:%f" % np_lr[0])
print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " print(
"ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s" "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
% (epoch, current_file_index, total_file, steps, "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
np.mean(np.array(cost)), % (epoch, current_file_index, total_file, steps,
np.mean(np.exp(np.array(lm_cost))), np.mean(np.array(cost)),
np.mean(np.array(acc)), skip_steps / used_time, np.mean(np.exp(np.array(lm_cost))),
current_file, mask_type)) np.mean(np.array(acc)), skip_steps / used_time,
current_file, mask_type))
cost = [] cost = []
lm_cost = [] lm_cost = []
acc = [] acc = []
...@@ -341,8 +343,7 @@ def train(args): ...@@ -341,8 +343,7 @@ def train(args):
print("[validation_set] epoch: %d, step: %d, " print("[validation_set] epoch: %d, step: %d, "
"loss: %f, global ppl: %f, batch-averged ppl: %f, " "loss: %f, global ppl: %f, batch-averged ppl: %f, "
"next_sent_acc: %f, speed: %f steps/s" % "next_sent_acc: %f, speed: %f steps/s" %
(epoch, steps, (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
np.mean(np.array(vali_cost) / vali_steps),
np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)), np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)), np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
np.mean(np.array(vali_acc) / vali_steps), vali_speed)) np.mean(np.array(vali_acc) / vali_steps), vali_speed))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册