Commit 78489c4b authored by tianxin04

format

Parent 354d97a8
@@ -19,7 +19,15 @@ from __future__ import print_function
import numpy as np
def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
def mask(batch_tokens,
seg_labels,
mask_word_tags,
total_token_num,
vocab_size,
CLS=1,
SEP=2,
MASK=3):
"""
Add masks to batch_tokens; return out, mask_label, mask_pos.
Note: mask_pos indexes batch_tokens after padding.
@@ -90,7 +98,8 @@ def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size,
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
sent[token_index] = replace_ids[prob_index +
token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
@@ -143,7 +152,10 @@ def prepare_batch_data(insts,
pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)
return_list = [src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels, next_sent_index]
return_list = [
src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels,
next_sent_index
]
return return_list
@@ -207,4 +219,5 @@ def pad_batch_data(insts,
if __name__ == "__main__":
pass
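The flat offsets stored in mask_pos above (sent_index * max_len + token_index) address tokens in the padded, flattened batch. A minimal sketch of the idea, with a hypothetical padded batch:

import numpy as np

# hypothetical 2 x 4 batch, already padded to max_len = 4 with pad id 0
batch = np.array([[11, 12, 13, 0],
                  [21, 22, 23, 24]])
max_len = batch.shape[1]
# suppose tokens (0, 2) and (1, 1) were masked
mask_pos = [0 * max_len + 2, 1 * max_len + 1]
print(batch.reshape(-1)[mask_pos])  # -> [13 22], the tokens at the masked slots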
@@ -25,22 +25,20 @@ import paddle.fluid as fluid
from model.ernie import ErnieModel
def create_model(args,
pyreader_name,
ernie_config,
is_prediction=False):
def create_model(args, pyreader_name, ernie_config, is_prediction=False):
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1], [-1, 1]],
[-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1],
[-1, 1]],
dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, sent_ids, pos_ids, self_attn_mask, labels,
next_sent_index, qids) = fluid.layers.read_file(pyreader)
(src_ids, sent_ids, pos_ids, self_attn_mask, labels, next_sent_index,
qids) = fluid.layers.read_file(pyreader)
ernie = ErnieModel(
src_ids=src_ids,
@@ -57,7 +55,7 @@ def create_model(args,
dropout_implementation="upscale_in_train")
logits = fluid.layers.fc(
input=cls_feats,
size=ernie_config["num_labels"],
size=args.num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
@@ -82,18 +80,21 @@ def create_model(args,
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
graph_vars = {"loss": loss,
graph_vars = {
"loss": loss,
"probs": probs,
"accuracy": accuracy,
"labels": labels,
"num_seqs": num_seqs,
"qids": qids}
"qids": qids
}
for k, v in graph_vars.items():
v.persistable=True
v.persistable = True
return pyreader, graph_vars
def evaluate_mrr(preds):
last_qid = None
total_mrr = 0.0
@@ -114,6 +115,7 @@ def evaluate_mrr(preds):
return total_mrr / qnum
def evaluate_map(preds):
def singe_map(st, en):
total_p = 0.0
@@ -142,9 +144,10 @@ def evaluate_map(preds):
total_map += singe_map(st, len(preds))
return total_map / qnum
def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
train_fetch_list = [graph_vars["loss"].name,
graph_vars["accuracy"].name,
train_fetch_list = [
graph_vars["loss"].name, graph_vars["accuracy"].name,
graph_vars["num_seqs"].name
]
@@ -152,7 +155,7 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
if "learning_rate" in graph_vars:
train_fetch_list.append(graph_vars["learning_rate"].name)
outputs = exe.run(fetch_list=train_fetch_list)
ret = {"loss":np.mean(outputs[0]), "accuracy":np.mean(outputs[1])}
ret = {"loss": np.mean(outputs[0]), "accuracy": np.mean(outputs[1])}
if "learning_rate" in graph_vars:
ret["learning_rate"] = float(outputs[4][0])
return ret
@@ -162,22 +165,21 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
qids, labels, scores = [], [], []
time_begin = time.time()
fetch_list = [graph_vars["loss"].name,
graph_vars["accuracy"].name,
graph_vars["probs"].name,
graph_vars["labels"].name,
graph_vars["num_seqs"].name,
graph_vars["qids"].name]
fetch_list = [
graph_vars["loss"].name, graph_vars["accuracy"].name,
graph_vars["probs"].name, graph_vars["labels"].name,
graph_vars["num_seqs"].name, graph_vars["qids"].name
]
while True:
try:
np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(program=test_program,
fetch_list=fetch_list)
np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(
program=test_program, fetch_list=fetch_list)
total_cost += np.sum(np_loss * np_num_seqs)
total_acc += np.sum(np_acc * np_num_seqs)
total_num_seqs += np.sum(np_num_seqs)
labels.extend(np_labels.reshape((-1)).tolist())
qids.extend(np_qids.reshape(-1).tolist())
scores.extend(np_probs[:,1].reshape(-1).tolist())
scores.extend(np_probs[:, 1].reshape(-1).tolist())
np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
total_label_pos_num += np.sum(np_labels)
total_pred_pos_num += np.sum(np_preds)
@@ -188,20 +190,23 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase):
time_end = time.time()
if len(qids) == 0:
print("[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s" %
(eval_phase, total_cost / total_num_seqs,
total_acc / total_num_seqs, total_num_seqs, time_end - time_begin))
print(
"[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s"
% (eval_phase, total_cost / total_num_seqs, total_acc /
total_num_seqs, total_num_seqs, time_end - time_begin))
else:
r = total_correct_num / total_label_pos_num
p = total_correct_num / total_pred_pos_num
f = 2 * p * r / (p + r)
assert len(qids) == len(labels) == len(scores)
preds = sorted(zip(qids, scores, labels), key=lambda elem:(elem[0], -elem[1]))
preds = sorted(
zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1]))
mrr = evaluate_mrr(preds)
map = evaluate_map(preds)
print("[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s" %
(eval_phase, total_cost / total_num_seqs,
total_acc / total_num_seqs,
mrr, map, p, r, f, total_num_seqs, time_end - time_begin))
print(
"[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s"
% (eval_phase, total_cost / total_num_seqs,
total_acc / total_num_seqs, mrr, map, p, r, f, total_num_seqs,
time_end - time_begin))
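evaluate() above sorts the (qid, score, label) triples by qid and descending score before computing MRR and MAP. The bodies of evaluate_mrr and evaluate_map are largely elided in this diff; a hedged, self-contained re-implementation of the MRR part, for illustration only:

def mrr_sketch(preds):
    # preds: [(qid, score, label)], already sorted by (qid, -score)
    total_mrr, qnum = 0.0, 0
    last_qid, rank, hit = None, 0, False
    for qid, _, label in preds:
        if qid != last_qid:
            qnum += 1
            rank, hit, last_qid = 0, False, qid
        rank += 1
        if label > 0 and not hit:
            total_mrr += 1.0 / rank  # reciprocal rank of first relevant hit
            hit = True
    return total_mrr / qnum

print(mrr_sketch([(1, 0.9, 0), (1, 0.8, 1), (2, 0.7, 1)]))  # (1/2 + 1) / 2 = 0.75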
@@ -74,3 +74,4 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
# yapf: enable
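ArgumentGroup comes from utils.args, whose body is not part of this diff. Judging only from the call sites above, a minimal sketch of such a helper might look like this; the str2bool detail is an assumption (argparse's bool("False") would otherwise be truthy), not the repo's actual implementation:

import argparse

def str2bool(v):
    # assumed helper: map "True"/"False" strings to real booleans
    return v.lower() in ("true", "t", "1")

class ArgumentGroup(object):
    def __init__(self, parser, title, desc):
        self._group = parser.add_argument_group(title=title, description=desc)

    def add_arg(self, name, dtype, default, help_text):
        dtype = str2bool if dtype == bool else dtype
        self._group.add_argument(
            "--" + name, type=dtype, default=default, help=help_text)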
@@ -24,7 +24,6 @@ from utils.args import ArgumentGroup, print_arguments
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("ernie_config_path", str, "./config/ernie_config.json", "Path to the json file for ernie model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
......
@@ -30,6 +30,7 @@ import paddle.fluid as fluid
from batching import prepare_batch_data
class ErnieDataReader(object):
def __init__(self,
filelist,
@@ -81,8 +82,8 @@ class ErnieDataReader(object):
sent_ids = [int(token) for token in sent_ids.split(" ")]
pos_ids = [int(token) for token in pos_ids.split(" ")]
seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")]
assert len(token_ids) == len(sent_ids) == len(
pos_ids) == len(seg_labels
assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
seg_labels
), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)"
label = int(label)
if len(token_ids) > max_seq_len:
@@ -153,13 +154,16 @@ class ErnieDataReader(object):
if left_len <= max_len:
return (token_seq[1:sep_index], seg_labels[1:sep_index])
else:
return [token_seq[sep_index + 1: -1], seg_labels[sep_index + 1 : -1]]
return [
token_seq[sep_index + 1:-1], seg_labels[sep_index + 1:-1]
]
for i in range(num_sample):
pair_index = (i + 1) % num_sample
left_tokens, left_seg_labels = split_sent(pos_samples[i],
(self.max_seq_len - 3) // 2, self.sep_id)
right_tokens, right_seg_labels = split_sent(pos_samples[pair_index],
left_tokens, left_seg_labels = split_sent(
pos_samples[i], (self.max_seq_len - 3) // 2, self.sep_id)
right_tokens, right_seg_labels = split_sent(
pos_samples[pair_index],
self.max_seq_len - 3 - len(left_tokens), self.sep_id)
token_seq = [self.cls_id] + left_tokens + [self.sep_id] + \
......
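The loop above builds negative next-sentence pairs by joining sample i with sample (i + 1) % num_sample, giving each side roughly half of the max_seq_len - 3 token budget (3 slots are reserved for [CLS] and two [SEP] tokens). A self-contained sketch of that pairing, with a simple stand-in for split_sent:

def split_sent_stub(sample, max_len):
    # stand-in for ErnieDataReader.split_sent: keep at most max_len tokens
    tokens, seg_labels = sample
    return tokens[:max_len], seg_labels[:max_len]

def make_negative_pairs(pos_samples, max_seq_len):
    num_sample = len(pos_samples)
    pairs = []
    for i in range(num_sample):
        pair_index = (i + 1) % num_sample  # shift by one to mismatch pairs
        left, _ = split_sent_stub(pos_samples[i], (max_seq_len - 3) // 2)
        right, _ = split_sent_stub(pos_samples[pair_index],
                                   max_seq_len - 3 - len(left))
        pairs.append((left, right))
    return pairs

print(make_negative_pairs(
    [([1, 2, 3, 4], [0, 0, 0, 1]), ([5, 6, 7], [0, 0, 1])], max_seq_len=9))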
@@ -148,7 +148,9 @@ class BaseReader(object):
else:
label_id = example.label
Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
qid = None
if "qid" in example._fields:
@@ -168,7 +170,8 @@ class BaseReader(object):
for index, example in enumerate(examples):
if phase == "train":
self.current_example = index
record = self._convert_example_to_record(example, self.max_seq_len, self.tokenizer)
record = self._convert_example_to_record(example, self.max_seq_len,
self.tokenizer)
max_len = max(max_len, len(record.token_ids))
if self.in_tokens:
to_append = (len(batch_records) + 1) * max_len <= batch_size
@@ -187,7 +190,12 @@ class BaseReader(object):
examples = self._read_tsv(input_file)
return len(examples)
def data_generator(self, input_file, batch_size, epoch, shuffle=True, phase=None):
def data_generator(self,
input_file,
batch_size,
epoch,
shuffle=True,
phase=None):
examples = self._read_tsv(input_file)
def wrapper():
@@ -198,8 +206,10 @@ class BaseReader(object):
if shuffle:
np.random.shuffle(examples)
for batch_data in self._prepare_batch_data(examples, batch_size, phase=phase):
for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase):
yield batch_data
return wrapper
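data_generator returns a closure rather than iterating immediately, so the pyreader can re-invoke it and the shuffle happens anew each epoch. The pattern in isolation:

import numpy as np

def data_generator(examples, batch_size, epoch, shuffle=True):
    def wrapper():
        for _ in range(epoch):
            if shuffle:
                np.random.shuffle(examples)  # fresh order every epoch
            for i in range(0, len(examples), batch_size):
                yield examples[i:i + batch_size]
    return wrapper

gen = data_generator(list(range(10)), batch_size=4, epoch=2)
for batch in gen():
    print(batch)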
@@ -209,7 +219,9 @@ class ClassifyReader(BaseReader):
with open(input_file, "r") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
headers = next(reader)
text_indices = [index for index, h in enumerate(headers) if h != "label"]
text_indices = [
index for index, h in enumerate(headers) if h != "label"
]
Example = namedtuple('Example', headers)
examples = []
@@ -236,17 +248,24 @@ class ClassifyReader(BaseReader):
# padding
padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
batch_token_ids, pad_idx=self.pad_id, return_next_sent_pos=True, return_attn_bias=True)
padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id)
return_list = [padded_token_ids, padded_text_type_ids, padded_position_ids, self_attn_bias, batch_labels, next_sent_index, batch_qids]
batch_token_ids,
pad_idx=self.pad_id,
return_next_sent_pos=True,
return_attn_bias=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
self_attn_bias, batch_labels, next_sent_index, batch_qids
]
return return_list
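pad_batch_data (defined in batching.py, its body elided earlier in this diff) right-pads every instance to the batch maximum length; with return_attn_bias=True it also builds a mask that keeps attention off the padding. A hedged numpy sketch of the core behaviour:

import numpy as np

def pad_batch_data_sketch(insts, pad_idx=0, return_attn_bias=False):
    max_len = max(len(inst) for inst in insts)
    out = np.array(
        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts],
        dtype="int64").reshape([-1, max_len, 1])
    if not return_attn_bias:
        return out
    # 0 on real tokens, a large negative value on padding, so that a
    # downstream softmax assigns (near-)zero attention to padded slots
    bias = np.array(
        [[0.0] * len(inst) + [-1e9] * (max_len - len(inst)) for inst in insts],
        dtype="float32")
    bias = np.tile(bias.reshape([-1, 1, max_len]), [1, max_len, 1])
    return out, bias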
class SequenceLabelReader(BaseReader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
@@ -256,13 +275,23 @@ class SequenceLabelReader(BaseReader):
# padding
padded_token_ids, self_attn_bias = pad_batch_data(
batch_token_ids, pad_idx=self.pad_id, return_next_sent_pos=False, return_attn_bias=True)
padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id)
padded_label_ids = pad_batch_data(batch_label_ids, pad_idx=len(self.label_map)-1)
batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape([-1, 1])
return_list = [padded_token_ids, padded_text_type_ids, padded_position_ids, self_attn_bias, padded_label_ids, batch_seq_lens]
batch_token_ids,
pad_idx=self.pad_id,
return_next_sent_pos=False,
return_attn_bias=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
padded_label_ids = pad_batch_data(
batch_label_ids, pad_idx=len(self.label_map) - 1)
batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape(
[-1, 1])
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
self_attn_bias, padded_label_ids, batch_seq_lens
]
return return_list
def _reseg_token_label(self, tokens, labels, tokenizer):
@@ -299,9 +328,13 @@ class SequenceLabelReader(BaseReader):
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
no_entity_id = len(self.label_map) - 1
label_ids = [no_entity_id] + [self.label_map[label] for label in labels] + [no_entity_id]
label_ids = [no_entity_id] + [
self.label_map[label] for label in labels
] + [no_entity_id]
Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
@@ -309,5 +342,6 @@ class SequenceLabelReader(BaseReader):
label_ids=label_ids)
return record
if __name__ == '__main__':
pass
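In _convert_example_to_record above, the first and last label ids wrap the real labels because tokenization adds [CLS] and [SEP]; both positions get no_entity_id, the same id later used to pad label sequences. A tiny worked example with an assumed label_map:

label_map = {"B-PER": 0, "I-PER": 1, "O": 2}   # hypothetical mapping
no_entity_id = len(label_map) - 1               # -> 2, the "O" class
labels = ["B-PER", "I-PER", "O"]
label_ids = [no_entity_id] + [label_map[l] for l in labels] + [no_entity_id]
print(label_ids)  # [2, 0, 1, 2]: CLS slot, real tokens, SEP slot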
@@ -34,9 +34,9 @@ from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params, init_checkpoint
from finetune_args import parser
args = parser.parse_args()
def main(args):
ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config()
@@ -49,7 +49,8 @@ def main(args):
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
reader = task_reader.ClassifyReader(
vocab_path=args.vocab_path,
label_map_config=args.label_map_config,
max_seq_len=args.max_seq_len,
do_lower_case=args.do_lower_case,
@@ -108,7 +109,8 @@ def main(args):
fluid.memory_optimize(
input_program=train_program,
skip_opt_set=[graph_vars["loss"].name,
skip_opt_set=[
graph_vars["loss"].name,
graph_vars["probs"].name,
graph_vars["accuracy"].name,
graph_vars["num_seqs"].name,
@@ -201,7 +203,8 @@ def main(args):
if steps % args.skip_steps != 0:
train_exe.run(fetch_list=[])
else:
outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, "train")
outputs = evaluate(train_exe, train_program, train_pyreader,
graph_vars, "train")
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
@@ -217,7 +220,8 @@ def main(args):
print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
"ave acc: %f, speed: %f steps/s" %
(current_epoch, current_example, num_train_examples,
steps, outputs["loss"], outputs["accuracy"], args.skip_steps / used_time))
steps, outputs["loss"], outputs["accuracy"],
args.skip_steps / used_time))
time_begin = time.time()
if steps % args.save_steps == 0:
@@ -254,7 +258,9 @@ def main(args):
if args.do_val:
test_pyreader.decorate_tensor_provider(
reader.data_generator(
args.dev_set, batch_size=args.batch_size, epoch=1,
args.dev_set,
batch_size=args.batch_size,
epoch=1,
shuffle=False))
print("Final validation result:")
evaluate(exe, test_prog, test_pyreader, graph_vars, "dev")
@@ -273,4 +279,5 @@ def main(args):
if __name__ == '__main__':
print_arguments(args)
main(args)
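Two details of this script work together: create_model marks every entry of graph_vars persistable, and main() lists the same variables in memory_optimize's skip_opt_set. Both protect tensors that will be fetched after exe.run from having their buffers reused by the optimizer. A sketch of that convention; the SimpleNamespace objects are toy stand-ins for fluid Variables:

from types import SimpleNamespace

def protect_fetch_targets(graph_vars):
    # mark fetch targets persistable and collect names for skip_opt_set
    for v in graph_vars.values():
        v.persistable = True
    return [v.name for v in graph_vars.values()]

demo = {"loss": SimpleNamespace(name="loss", persistable=False)}
print(protect_fetch_targets(demo))  # -> ['loss']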
@@ -33,9 +33,9 @@ from utils.args import print_arguments
from finetune.sequence_label import create_model, evaluate
from finetune_args import parser
args = parser.parse_args()
def main(args):
ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config()
@@ -48,7 +48,8 @@ def main(args):
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
reader = task_reader.SequenceLabelReader(vocab_path=args.vocab_path,
reader = task_reader.SequenceLabelReader(
vocab_path=args.vocab_path,
label_map_config=args.label_map_config,
max_seq_len=args.max_seq_len,
do_lower_case=args.do_lower_case,
@@ -107,10 +108,9 @@ def main(args):
fluid.memory_optimize(
input_program=train_program,
skip_opt_set=[graph_vars["loss"].name,
graph_vars["labels"].name,
graph_vars["infers"].name,
graph_vars["seq_lens"].name
skip_opt_set=[
graph_vars["loss"].name, graph_vars["labels"].name,
graph_vars["infers"].name, graph_vars["seq_lens"].name
])
if args.verbose:
@@ -200,21 +200,23 @@ def main(args):
if steps % args.skip_steps != 0:
train_exe.run(fetch_list=[])
else:
outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, args.num_labels, "train", dev_count)
outputs = evaluate(train_exe, train_program, train_pyreader,
graph_vars, args.num_labels, "train",
dev_count)
if args.verbose:
verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
)
verbose += "learning rate: %f" % (
outputs["lr"] if warmup_steps > 0 else args.learning_rate)
outputs["lr"]
if warmup_steps > 0 else args.learning_rate)
print(verbose)
current_example, current_epoch = reader.get_train_progress(
)
current_example, current_epoch = reader.get_train_progress()
time_end = time.time()
used_time = time_end - time_begin
print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"f1: %f, precision: %f, recall: %f, speed: %f steps/s" %
(current_epoch, current_example, num_train_examples,
"f1: %f, precision: %f, recall: %f, speed: %f steps/s"
% (current_epoch, current_example, num_train_examples,
steps, outputs["loss"], outputs["f1"],
outputs["precision"], outputs["recall"],
args.skip_steps / used_time))
@@ -234,7 +236,8 @@ def main(args):
batch_size=args.batch_size,
epoch=1,
shuffle=False))
evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev")
evaluate(exe, test_prog, test_pyreader, graph_vars,
args.num_labels, "dev")
# evaluate test set
if args.do_test:
test_pyreader.decorate_tensor_provider(
@@ -243,7 +246,8 @@ def main(args):
batch_size=args.batch_size,
epoch=1,
shuffle=False))
evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "test")
evaluate(exe, test_prog, test_pyreader, graph_vars,
args.num_labels, "test")
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints, "step_" + str(steps))
@@ -255,7 +259,9 @@ def main(args):
if args.do_val:
test_pyreader.decorate_tensor_provider(
reader.data_generator(
args.dev_set, batch_size=args.batch_size, epoch=1,
args.dev_set,
batch_size=args.batch_size,
epoch=1,
shuffle=False))
print("Final validation result:")
evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev")
......
@@ -35,8 +35,10 @@ from utils.init import init_checkpoint, init_pretraining_params
from pretrain_args import parser
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, ernie_config):
pyreader = fluid.layers.py_reader(
capacity=70,
@@ -224,8 +226,7 @@ def train(args):
print("train_id == 0, sleep 60s")
time.sleep(60)
print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
trainer_id:{}"
.format(worker_endpoints, trainers_num,
trainer_id:{}".format(worker_endpoints, trainers_num,
current_endpoint, trainer_id))
# prepare nccl2 env.
@@ -319,7 +320,8 @@ def train(args):
epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
)
print("current learning_rate:%f" % np_lr[0])
print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
print(
"epoch: %d, progress: %d/%d, step: %d, loss: %f, "
"ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
% (epoch, current_file_index, total_file, steps,
np.mean(np.array(cost)),
@@ -341,8 +343,7 @@ def train(args):
print("[validation_set] epoch: %d, step: %d, "
"loss: %f, global ppl: %f, batch-averged ppl: %f, "
"next_sent_acc: %f, speed: %f steps/s" %
(epoch, steps,
np.mean(np.array(vali_cost) / vali_steps),
(epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
np.mean(np.array(vali_acc) / vali_steps), vali_speed))
......
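The validation log above prints two perplexities that differ only in where exp() is applied: "global ppl" exponentiates the mean LM loss, while "batch-averaged ppl" averages the per-batch perplexities; by Jensen's inequality the second is never smaller. A quick numeric check:

import numpy as np

per_batch_loss = np.array([2.0, 4.0])   # hypothetical per-batch LM losses
print(np.exp(np.mean(per_batch_loss)))  # global ppl: e**3 ~= 20.09
print(np.mean(np.exp(per_batch_loss)))  # batch-averaged ppl: ~= 30.99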