diff --git a/ERNIE/batching.py b/ERNIE/batching.py index 618f66206965df45a2646ffae0d35c7bf83fb4e5..beea08241c7e3607ec226895901bef59ca097237 100644 --- a/ERNIE/batching.py +++ b/ERNIE/batching.py @@ -19,7 +19,15 @@ from __future__ import print_function import numpy as np -def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3): + +def mask(batch_tokens, + seg_labels, + mask_word_tags, + total_token_num, + vocab_size, + CLS=1, + SEP=2, + MASK=3): """ Add mask for batch_tokens, return out, mask_label, mask_pos; Note: mask_pos responding the batch_tokens after padded; @@ -90,7 +98,8 @@ def mask(batch_tokens, seg_labels, mask_word_tags, total_token_num, vocab_size, # random replace if token != SEP and token != CLS: mask_label.append(sent[token_index]) - sent[token_index] = replace_ids[prob_index + token_index] + sent[token_index] = replace_ids[prob_index + + token_index] mask_flag = True mask_pos.append(sent_index * max_len + token_index) else: @@ -143,7 +152,10 @@ def prepare_batch_data(insts, pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id) sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id) - return_list = [src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels, next_sent_index] + return_list = [ + src_id, pos_id, sent_id, self_attn_bias, mask_label, mask_pos, labels, + next_sent_index + ] return return_list @@ -207,4 +219,5 @@ def pad_batch_data(insts, if __name__ == "__main__": + pass diff --git a/ERNIE/finetune/classifier.py b/ERNIE/finetune/classifier.py index 0e1f1f9d217b3a6eb6ed15f7fab6497b32446132..b5609af32f8399228e5b2450d07760e253ea7c60 100644 --- a/ERNIE/finetune/classifier.py +++ b/ERNIE/finetune/classifier.py @@ -25,22 +25,20 @@ import paddle.fluid as fluid from model.ernie import ErnieModel -def create_model(args, - pyreader_name, - ernie_config, - is_prediction=False): +def create_model(args, pyreader_name, ernie_config, is_prediction=False): pyreader = fluid.layers.py_reader( capacity=50, shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1], [-1, 1]], + [-1, args.max_seq_len, args.max_seq_len], [-1, 1], [-1, 1], + [-1, 1]], dtypes=['int64', 'int64', 'int64', 'float', 'int64', 'int64', 'int64'], lod_levels=[0, 0, 0, 0, 0, 0, 0], name=pyreader_name, use_double_buffer=True) - (src_ids, sent_ids, pos_ids, self_attn_mask, labels, - next_sent_index, qids) = fluid.layers.read_file(pyreader) + (src_ids, sent_ids, pos_ids, self_attn_mask, labels, next_sent_index, + qids) = fluid.layers.read_file(pyreader) ernie = ErnieModel( src_ids=src_ids, @@ -57,7 +55,7 @@ def create_model(args, dropout_implementation="upscale_in_train") logits = fluid.layers.fc( input=cls_feats, - size=ernie_config["num_labels"], + size=args.num_labels, param_attr=fluid.ParamAttr( name="cls_out_w", initializer=fluid.initializer.TruncatedNormal(scale=0.02)), @@ -82,18 +80,21 @@ def create_model(args, num_seqs = fluid.layers.create_tensor(dtype='int64') accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) - graph_vars = {"loss": loss, - "probs": probs, - "accuracy": accuracy, - "labels": labels, - "num_seqs": num_seqs, - "qids": qids} + graph_vars = { + "loss": loss, + "probs": probs, + "accuracy": accuracy, + "labels": labels, + "num_seqs": num_seqs, + "qids": qids + } for k, v in graph_vars.items(): - v.persistable=True + v.persistable = True return pyreader, graph_vars + def evaluate_mrr(preds): last_qid = None total_mrr = 0.0 @@ -114,6 +115,7 @@ def evaluate_mrr(preds): return total_mrr / qnum + def evaluate_map(preds): def singe_map(st, en): total_p = 0.0 @@ -142,17 +144,18 @@ def evaluate_map(preds): total_map += singe_map(st, len(preds)) return total_map / qnum + def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase): - train_fetch_list = [graph_vars["loss"].name, - graph_vars["accuracy"].name, - graph_vars["num_seqs"].name - ] + train_fetch_list = [ + graph_vars["loss"].name, graph_vars["accuracy"].name, + graph_vars["num_seqs"].name + ] if eval_phase == "train": if "learning_rate" in graph_vars: train_fetch_list.append(graph_vars["learning_rate"].name) outputs = exe.run(fetch_list=train_fetch_list) - ret = {"loss":np.mean(outputs[0]), "accuracy":np.mean(outputs[1])} + ret = {"loss": np.mean(outputs[0]), "accuracy": np.mean(outputs[1])} if "learning_rate" in graph_vars: ret["learning_rate"] = float(outputs[4][0]) return ret @@ -162,22 +165,21 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase): qids, labels, scores = [], [], [] time_begin = time.time() - fetch_list = [graph_vars["loss"].name, - graph_vars["accuracy"].name, - graph_vars["probs"].name, - graph_vars["labels"].name, - graph_vars["num_seqs"].name, - graph_vars["qids"].name] + fetch_list = [ + graph_vars["loss"].name, graph_vars["accuracy"].name, + graph_vars["probs"].name, graph_vars["labels"].name, + graph_vars["num_seqs"].name, graph_vars["qids"].name + ] while True: try: - np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(program=test_program, - fetch_list=fetch_list) + np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run( + program=test_program, fetch_list=fetch_list) total_cost += np.sum(np_loss * np_num_seqs) total_acc += np.sum(np_acc * np_num_seqs) total_num_seqs += np.sum(np_num_seqs) labels.extend(np_labels.reshape((-1)).tolist()) qids.extend(np_qids.reshape(-1).tolist()) - scores.extend(np_probs[:,1].reshape(-1).tolist()) + scores.extend(np_probs[:, 1].reshape(-1).tolist()) np_preds = np.argmax(np_probs, axis=1).astype(np.float32) total_label_pos_num += np.sum(np_labels) total_pred_pos_num += np.sum(np_preds) @@ -188,20 +190,23 @@ def evaluate(exe, test_program, test_pyreader, graph_vars, eval_phase): time_end = time.time() if len(qids) == 0: - print("[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s" % - (eval_phase, total_cost / total_num_seqs, - total_acc / total_num_seqs, total_num_seqs, time_end - time_begin)) + print( + "[%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s" + % (eval_phase, total_cost / total_num_seqs, total_acc / + total_num_seqs, total_num_seqs, time_end - time_begin)) else: r = total_correct_num / total_label_pos_num p = total_correct_num / total_pred_pos_num f = 2 * p * r / (p + r) assert len(qids) == len(labels) == len(scores) - preds = sorted(zip(qids, scores, labels), key=lambda elem:(elem[0], -elem[1])) + preds = sorted( + zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1])) mrr = evaluate_mrr(preds) map = evaluate_map(preds) - print("[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s" % - (eval_phase, total_cost / total_num_seqs, - total_acc / total_num_seqs, - mrr, map, p, r, f, total_num_seqs, time_end - time_begin)) + print( + "[%s evaluation] ave loss: %f, ave_acc: %f, mrr: %f, map: %f, p: %f, r: %f, f1: %f, data_num: %d, elapsed time: %f s" + % (eval_phase, total_cost / total_num_seqs, + total_acc / total_num_seqs, mrr, map, p, r, f, total_num_seqs, + time_end - time_begin)) diff --git a/ERNIE/finetune_args.py b/ERNIE/finetune_args.py index 1f7d9349b8087f57addfbdf9cf8b1bf48156d65e..f25e5ab0cc0d76d18838437423aed8499d605e18 100644 --- a/ERNIE/finetune_args.py +++ b/ERNIE/finetune_args.py @@ -64,7 +64,7 @@ data_g.add_arg("do_lower_case", bool, True, "Whether to lower case the input text. Should be True for uncased models and False for cased models.") data_g.add_arg("random_seed", int, 0, "Random seed.") data_g.add_arg("label_map_config", str, None, "label_map_path.") -data_g.add_arg("num_labels", int, 2, "label number") +data_g.add_arg("num_labels", int, 2, "label number") run_type_g = ArgumentGroup(parser, "run_type", "running type options.") run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") @@ -74,3 +74,4 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.") run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.") run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.") +# yapf: enable diff --git a/ERNIE/pretrain_args.py b/ERNIE/pretrain_args.py index 2543a346ff7cab114a9418978c2e5dfc5b018a5a..a552f3c34051ca8d0dd070bc49cef2db8aef0de2 100644 --- a/ERNIE/pretrain_args.py +++ b/ERNIE/pretrain_args.py @@ -24,7 +24,6 @@ from utils.args import ArgumentGroup, print_arguments # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser = argparse.ArgumentParser(__doc__) model_g = ArgumentGroup(parser, "model", "model configuration and paths.") model_g.add_arg("ernie_config_path", str, "./config/ernie_config.json", "Path to the json file for ernie model config.") model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.") diff --git a/ERNIE/reader/pretraining.py b/ERNIE/reader/pretraining.py index f8c3d883ec66571b910e34bc3963db6e6778e2d2..67d0e593591110db785191ad3f38b67c7a3e0d2d 100644 --- a/ERNIE/reader/pretraining.py +++ b/ERNIE/reader/pretraining.py @@ -30,6 +30,7 @@ import paddle.fluid as fluid from batching import prepare_batch_data + class ErnieDataReader(object): def __init__(self, filelist, @@ -81,8 +82,8 @@ class ErnieDataReader(object): sent_ids = [int(token) for token in sent_ids.split(" ")] pos_ids = [int(token) for token in pos_ids.split(" ")] seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")] - assert len(token_ids) == len(sent_ids) == len( - pos_ids) == len(seg_labels + assert len(token_ids) == len(sent_ids) == len(pos_ids) == len( + seg_labels ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)" label = int(label) if len(token_ids) > max_seq_len: @@ -153,14 +154,17 @@ class ErnieDataReader(object): if left_len <= max_len: return (token_seq[1:sep_index], seg_labels[1:sep_index]) else: - return [token_seq[sep_index + 1: -1], seg_labels[sep_index + 1 : -1]] + return [ + token_seq[sep_index + 1:-1], seg_labels[sep_index + 1:-1] + ] for i in range(num_sample): pair_index = (i + 1) % num_sample - left_tokens, left_seg_labels = split_sent(pos_samples[i], - (self.max_seq_len - 3) // 2, self.sep_id) - right_tokens, right_seg_labels = split_sent(pos_samples[pair_index], - self.max_seq_len - 3 - len(left_tokens), self.sep_id) + left_tokens, left_seg_labels = split_sent( + pos_samples[i], (self.max_seq_len - 3) // 2, self.sep_id) + right_tokens, right_seg_labels = split_sent( + pos_samples[pair_index], + self.max_seq_len - 3 - len(left_tokens), self.sep_id) token_seq = [self.cls_id] + left_tokens + [self.sep_id] + \ right_tokens + [self.sep_id] diff --git a/ERNIE/reader/task_reader.py b/ERNIE/reader/task_reader.py index 28aacbdbfc9c544a170f6a969605eb464112d335..74130d6c8ff17a1a715d231ba68e10561c11622a 100644 --- a/ERNIE/reader/task_reader.py +++ b/ERNIE/reader/task_reader.py @@ -62,7 +62,7 @@ class BaseReader(object): reader = csv.reader(f, delimiter="\t", quotechar=quotechar) headers = next(reader) Example = namedtuple('Example', headers) - + examples = [] for line in reader: example = Example(*line) @@ -85,7 +85,7 @@ class BaseReader(object): else: tokens_b.pop() - def _convert_example_to_record(self, example, max_seq_length, tokenizer): + def _convert_example_to_record(self, example, max_seq_length, tokenizer): """Converts a single `Example` into a single `Record`.""" text_a = tokenization.convert_to_unicode(example.text_a) @@ -148,7 +148,9 @@ class BaseReader(object): else: label_id = example.label - Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid']) + Record = namedtuple( + 'Record', + ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid']) qid = None if "qid" in example._fields: @@ -164,11 +166,12 @@ class BaseReader(object): def _prepare_batch_data(self, examples, batch_size, phase=None): """generate batch records""" - batch_records, max_len = [], 0 + batch_records, max_len = [], 0 for index, example in enumerate(examples): if phase == "train": self.current_example = index - record = self._convert_example_to_record(example, self.max_seq_len, self.tokenizer) + record = self._convert_example_to_record(example, self.max_seq_len, + self.tokenizer) max_len = max(max_len, len(record.token_ids)) if self.in_tokens: to_append = (len(batch_records) + 1) * max_len <= batch_size @@ -187,7 +190,12 @@ class BaseReader(object): examples = self._read_tsv(input_file) return len(examples) - def data_generator(self, input_file, batch_size, epoch, shuffle=True, phase=None): + def data_generator(self, + input_file, + batch_size, + epoch, + shuffle=True, + phase=None): examples = self._read_tsv(input_file) def wrapper(): @@ -198,8 +206,10 @@ class BaseReader(object): if shuffle: np.random.shuffle(examples) - for batch_data in self._prepare_batch_data(examples, batch_size, phase=phase): + for batch_data in self._prepare_batch_data( + examples, batch_size, phase=phase): yield batch_data + return wrapper @@ -209,9 +219,11 @@ class ClassifyReader(BaseReader): with open(input_file, "r") as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) headers = next(reader) - text_indices = [index for index, h in enumerate(headers) if h != "label"] + text_indices = [ + index for index, h in enumerate(headers) if h != "label" + ] Example = namedtuple('Example', headers) - + examples = [] for line in reader: for index, text in enumerate(line): @@ -219,8 +231,8 @@ class ClassifyReader(BaseReader): line[index] = text.replace(' ', '') example = Example(*line) examples.append(example) - return examples - + return examples + def _pad_batch_records(self, batch_records): batch_token_ids = [record.token_ids for record in batch_records] batch_text_type_ids = [record.text_type_ids for record in batch_records] @@ -236,33 +248,50 @@ class ClassifyReader(BaseReader): # padding padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data( - batch_token_ids, pad_idx=self.pad_id, return_next_sent_pos=True, return_attn_bias=True) - padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id) - - return_list = [padded_token_ids, padded_text_type_ids, padded_position_ids, self_attn_bias, batch_labels, next_sent_index, batch_qids] + batch_token_ids, + pad_idx=self.pad_id, + return_next_sent_pos=True, + return_attn_bias=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + self_attn_bias, batch_labels, next_sent_index, batch_qids + ] return return_list class SequenceLabelReader(BaseReader): - def _pad_batch_records(self, batch_records): batch_token_ids = [record.token_ids for record in batch_records] batch_text_type_ids = [record.text_type_ids for record in batch_records] batch_position_ids = [record.position_ids for record in batch_records] batch_label_ids = [record.label_ids for record in batch_records] - batch_seq_lens = [len(record.token_ids) for record in batch_records] + batch_seq_lens = [len(record.token_ids) for record in batch_records] # padding padded_token_ids, self_attn_bias = pad_batch_data( - batch_token_ids, pad_idx=self.pad_id, return_next_sent_pos=False, return_attn_bias=True) - padded_text_type_ids = pad_batch_data(batch_text_type_ids, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data(batch_position_ids, pad_idx=self.pad_id) - padded_label_ids = pad_batch_data(batch_label_ids, pad_idx=len(self.label_map)-1) - batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape([-1, 1]) - - return_list = [padded_token_ids, padded_text_type_ids, padded_position_ids, self_attn_bias, padded_label_ids, batch_seq_lens] + batch_token_ids, + pad_idx=self.pad_id, + return_next_sent_pos=False, + return_attn_bias=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + padded_label_ids = pad_batch_data( + batch_label_ids, pad_idx=len(self.label_map) - 1) + batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape( + [-1, 1]) + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + self_attn_bias, padded_label_ids, batch_seq_lens + ] return return_list def _reseg_token_label(self, tokens, labels, tokenizer): @@ -285,7 +314,7 @@ class SequenceLabelReader(BaseReader): assert len(ret_tokens) == len(ret_labels) return ret_tokens, ret_labels - def _convert_example_to_record(self, example, max_seq_length, tokenizer): + def _convert_example_to_record(self, example, max_seq_length, tokenizer): tokens = tokenization.convert_to_unicode(example.text_a).split(u"") labels = tokenization.convert_to_unicode(example.label).split(u"") tokens, labels = self._reseg_token_label(tokens, labels, tokenizer) @@ -297,11 +326,15 @@ class SequenceLabelReader(BaseReader): tokens = ["[CLS]"] + tokens + ["[SEP]"] token_ids = tokenizer.convert_tokens_to_ids(tokens) position_ids = list(range(len(token_ids))) - text_type_ids = [0] * len(token_ids) + text_type_ids = [0] * len(token_ids) no_entity_id = len(self.label_map) - 1 - label_ids = [no_entity_id] + [self.label_map[label] for label in labels] + [no_entity_id] + label_ids = [no_entity_id] + [ + self.label_map[label] for label in labels + ] + [no_entity_id] - Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) + Record = namedtuple( + 'Record', + ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) record = Record( token_ids=token_ids, text_type_ids=text_type_ids, @@ -309,5 +342,6 @@ class SequenceLabelReader(BaseReader): label_ids=label_ids) return record + if __name__ == '__main__': pass diff --git a/ERNIE/run_classifier.py b/ERNIE/run_classifier.py index a891a37f9a3dadb089d8d07644c98a90ef154378..df024902887fb1cf4a1ad03ba83a29de18431e3e 100644 --- a/ERNIE/run_classifier.py +++ b/ERNIE/run_classifier.py @@ -32,11 +32,11 @@ from finetune.classifier import create_model, evaluate from optimization import optimization from utils.args import ArgumentGroup, print_arguments from utils.init import init_pretraining_params, init_checkpoint -from finetune_args import parser - +from finetune_args import parser args = parser.parse_args() + def main(args): ernie_config = ErnieConfig(args.ernie_config_path) ernie_config.print_config() @@ -49,12 +49,13 @@ def main(args): dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) - reader = task_reader.ClassifyReader(vocab_path=args.vocab_path, - label_map_config=args.label_map_config, - max_seq_len=args.max_seq_len, - do_lower_case=args.do_lower_case, - in_tokens=args.in_tokens, - random_seed=args.random_seed) + reader = task_reader.ClassifyReader( + vocab_path=args.vocab_path, + label_map_config=args.label_map_config, + max_seq_len=args.max_seq_len, + do_lower_case=args.do_lower_case, + in_tokens=args.in_tokens, + random_seed=args.random_seed) if not (args.do_train or args.do_val or args.do_test): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " @@ -108,10 +109,11 @@ def main(args): fluid.memory_optimize( input_program=train_program, - skip_opt_set=[graph_vars["loss"].name, - graph_vars["probs"].name, - graph_vars["accuracy"].name, - graph_vars["num_seqs"].name, + skip_opt_set=[ + graph_vars["loss"].name, + graph_vars["probs"].name, + graph_vars["accuracy"].name, + graph_vars["num_seqs"].name, ]) if args.verbose: @@ -201,7 +203,8 @@ def main(args): if steps % args.skip_steps != 0: train_exe.run(fetch_list=[]) else: - outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, "train") + outputs = evaluate(train_exe, train_program, train_pyreader, + graph_vars, "train") if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( @@ -217,7 +220,8 @@ def main(args): print("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, " "ave acc: %f, speed: %f steps/s" % (current_epoch, current_example, num_train_examples, - steps, outputs["loss"], outputs["accuracy"], args.skip_steps / used_time)) + steps, outputs["loss"], outputs["accuracy"], + args.skip_steps / used_time)) time_begin = time.time() if steps % args.save_steps == 0: @@ -254,7 +258,9 @@ def main(args): if args.do_val: test_pyreader.decorate_tensor_provider( reader.data_generator( - args.dev_set, batch_size=args.batch_size, epoch=1, + args.dev_set, + batch_size=args.batch_size, + epoch=1, shuffle=False)) print("Final validation result:") evaluate(exe, test_prog, test_pyreader, graph_vars, "dev") @@ -273,4 +279,5 @@ def main(args): if __name__ == '__main__': print_arguments(args) + main(args) diff --git a/ERNIE/run_sequence_labeling.py b/ERNIE/run_sequence_labeling.py index 71158cf219e1583ea6427448a947740cb86ecc06..1ee7544293e3bf920b21229a90b1341966617f18 100644 --- a/ERNIE/run_sequence_labeling.py +++ b/ERNIE/run_sequence_labeling.py @@ -30,12 +30,12 @@ from model.ernie import ErnieConfig from optimization import optimization from utils.init import init_pretraining_params, init_checkpoint from utils.args import print_arguments -from finetune.sequence_label import create_model, evaluate -from finetune_args import parser - +from finetune.sequence_label import create_model, evaluate +from finetune_args import parser args = parser.parse_args() + def main(args): ernie_config = ErnieConfig(args.ernie_config_path) ernie_config.print_config() @@ -48,12 +48,13 @@ def main(args): dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) - reader = task_reader.SequenceLabelReader(vocab_path=args.vocab_path, - label_map_config=args.label_map_config, - max_seq_len=args.max_seq_len, - do_lower_case=args.do_lower_case, - in_tokens=args.in_tokens, - random_seed=args.random_seed) + reader = task_reader.SequenceLabelReader( + vocab_path=args.vocab_path, + label_map_config=args.label_map_config, + max_seq_len=args.max_seq_len, + do_lower_case=args.do_lower_case, + in_tokens=args.in_tokens, + random_seed=args.random_seed) if not (args.do_train or args.do_val or args.do_test): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " @@ -107,10 +108,9 @@ def main(args): fluid.memory_optimize( input_program=train_program, - skip_opt_set=[graph_vars["loss"].name, - graph_vars["labels"].name, - graph_vars["infers"].name, - graph_vars["seq_lens"].name + skip_opt_set=[ + graph_vars["loss"].name, graph_vars["labels"].name, + graph_vars["infers"].name, graph_vars["seq_lens"].name ]) if args.verbose: @@ -200,24 +200,26 @@ def main(args): if steps % args.skip_steps != 0: train_exe.run(fetch_list=[]) else: - outputs = evaluate(train_exe, train_program, train_pyreader, graph_vars, args.num_labels, "train", dev_count) + outputs = evaluate(train_exe, train_program, train_pyreader, + graph_vars, args.num_labels, "train", + dev_count) if args.verbose: verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( ) verbose += "learning rate: %f" % ( - outputs["lr"] if warmup_steps > 0 else args.learning_rate) + outputs["lr"] + if warmup_steps > 0 else args.learning_rate) print(verbose) - current_example, current_epoch = reader.get_train_progress( - ) + current_example, current_epoch = reader.get_train_progress() time_end = time.time() used_time = time_end - time_begin print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " - "f1: %f, precision: %f, recall: %f, speed: %f steps/s" % - (current_epoch, current_example, num_train_examples, - steps, outputs["loss"], outputs["f1"], - outputs["precision"], outputs["recall"], - args.skip_steps / used_time)) + "f1: %f, precision: %f, recall: %f, speed: %f steps/s" + % (current_epoch, current_example, num_train_examples, + steps, outputs["loss"], outputs["f1"], + outputs["precision"], outputs["recall"], + args.skip_steps / used_time)) time_begin = time.time() if steps % args.save_steps == 0: @@ -234,7 +236,8 @@ def main(args): batch_size=args.batch_size, epoch=1, shuffle=False)) - evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev") + evaluate(exe, test_prog, test_pyreader, graph_vars, + args.num_labels, "dev") # evaluate test set if args.do_test: test_pyreader.decorate_tensor_provider( @@ -243,7 +246,8 @@ def main(args): batch_size=args.batch_size, epoch=1, shuffle=False)) - evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "test") + evaluate(exe, test_prog, test_pyreader, graph_vars, + args.num_labels, "test") except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) @@ -255,7 +259,9 @@ def main(args): if args.do_val: test_pyreader.decorate_tensor_provider( reader.data_generator( - args.dev_set, batch_size=args.batch_size, epoch=1, + args.dev_set, + batch_size=args.batch_size, + epoch=1, shuffle=False)) print("Final validation result:") evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels, "dev") diff --git a/ERNIE/train.py b/ERNIE/train.py index 470d34bc94a01d16629784019ed2562bc86b6e82..e696c896cd071e756209cc5b62c610e53316db32 100644 --- a/ERNIE/train.py +++ b/ERNIE/train.py @@ -35,8 +35,10 @@ from utils.init import init_checkpoint, init_pretraining_params from pretrain_args import parser args = parser.parse_args() + # yapf: enable. + def create_model(pyreader_name, ernie_config): pyreader = fluid.layers.py_reader( capacity=70, @@ -224,8 +226,7 @@ def train(args): print("train_id == 0, sleep 60s") time.sleep(60) print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \ - trainer_id:{}" - .format(worker_endpoints, trainers_num, + trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint, trainer_id)) # prepare nccl2 env. @@ -319,13 +320,14 @@ def train(args): epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress( ) print("current learning_rate:%f" % np_lr[0]) - print("epoch: %d, progress: %d/%d, step: %d, loss: %f, " - "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s" - % (epoch, current_file_index, total_file, steps, - np.mean(np.array(cost)), - np.mean(np.exp(np.array(lm_cost))), - np.mean(np.array(acc)), skip_steps / used_time, - current_file, mask_type)) + print( + "epoch: %d, progress: %d/%d, step: %d, loss: %f, " + "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s" + % (epoch, current_file_index, total_file, steps, + np.mean(np.array(cost)), + np.mean(np.exp(np.array(lm_cost))), + np.mean(np.array(acc)), skip_steps / used_time, + current_file, mask_type)) cost = [] lm_cost = [] acc = [] @@ -341,8 +343,7 @@ def train(args): print("[validation_set] epoch: %d, step: %d, " "loss: %f, global ppl: %f, batch-averged ppl: %f, " "next_sent_acc: %f, speed: %f steps/s" % - (epoch, steps, - np.mean(np.array(vali_cost) / vali_steps), + (epoch, steps, np.mean(np.array(vali_cost) / vali_steps), np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)), np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)), np.mean(np.array(vali_acc) / vali_steps), vali_speed))