From e5d95cb904149ed7e036dfe99a32cc1a2566c745 Mon Sep 17 00:00:00 2001 From: chenxuyi Date: Mon, 4 Nov 2019 19:16:41 +0800 Subject: [PATCH] 1.6: DataLoader, ernie_encode + santiy check --- ernie/ernie_encoder.py | 29 ++++++++++++----------- ernie/finetune/classifier.py | 40 +++++++++++--------------------- ernie/finetune/mrc.py | 26 ++++++++++----------- ernie/finetune/sequence_label.py | 25 +++++++++----------- ernie/run_classifier.py | 8 +++---- ernie/run_mrc.py | 10 ++++---- ernie/run_sequence_labeling.py | 6 ++--- ernie/train.py | 29 +++++++++++------------ 8 files changed, 78 insertions(+), 95 deletions(-) diff --git a/ernie/ernie_encoder.py b/ernie/ernie_encoder.py index 1c47aa0..8b41374 100644 --- a/ernie/ernie_encoder.py +++ b/ernie/ernie_encoder.py @@ -52,18 +52,16 @@ run_type_g.add_arg("use_cuda", bool, True, "If set, use G def create_model(args, pyreader_name, ernie_config): - pyreader = fluid.layers.py_reader( - capacity=50, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1]], - dtypes=['int64', 'int64', 'int64', 'int64', 'float', 'int64'], - lod_levels=[0, 0, 0, 0, 0, 0], - name=pyreader_name, - use_double_buffer=True) - - (src_ids, sent_ids, pos_ids, task_ids, input_mask, - seq_lens) = fluid.layers.read_file(pyreader) + src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64') + sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64') + pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64') + task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64') + input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32') + seq_lens = fluid.layers.data(name='8', shape=[-1], dtype='int64') + + pyreader = fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, seq_lens], + 
capacity=70, + iterable=False) ernie = ErnieModel( src_ids=src_ids, @@ -143,7 +141,7 @@ def main(args): exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = dev_count - pyreader.decorate_tensor_provider(data_generator) + pyreader.set_batch_generator(data_generator) pyreader.start() total_cls_emb = [] @@ -167,6 +165,11 @@ def main(args): total_cls_emb = np.concatenate(total_cls_emb) total_top_layer_emb = np.concatenate(total_top_layer_emb) + if not os.path.exists(args.output_dir): + os.mkdir(args.output_dir) + else: + raise RuntimeError('output dir exists: %s' % args.output_dir) + with open(os.path.join(args.output_dir, "cls_emb.npy"), "wb") as cls_emb_file: np.save(cls_emb_file, total_cls_emb) diff --git a/ernie/finetune/classifier.py b/ernie/finetune/classifier.py index 4487e08..285cce2 100644 --- a/ernie/finetune/classifier.py +++ b/ernie/finetune/classifier.py @@ -39,34 +39,22 @@ def create_model(args, is_classify=False, is_regression=False, ernie_version="1.0"): + + src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64') + sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64') + pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64') + task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64') + input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32') + qids = fluid.layers.data(name='7', shape=[-1, 1], dtype='int64') + if is_classify: - pyreader = fluid.layers.py_reader( - capacity=50, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]], - dtypes=[ - 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64' - ], - lod_levels=[0, 0, 0, 0, 0, 0, 0], - name=task_name + "_" + pyreader_name, - use_double_buffer=True) + labels = fluid.layers.data(name='6', shape=[-1, 1], 
dtype='int64') elif is_regression: - pyreader = fluid.layers.py_reader( - capacity=50, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]], - dtypes=[ - 'int64', 'int64', 'int64', 'int64', 'float32', 'float32', - 'int64' - ], - lod_levels=[0, 0, 0, 0, 0, 0, 0], - name=task_name + "_" + pyreader_name, - use_double_buffer=True) - - (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, - qids) = fluid.layers.read_file(pyreader) + labels = fluid.layers.data(name='6', shape=[-1, 1], dtype='float32') + + pyreader = fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, qids], + capacity=70, + iterable=False) ernie = ErnieModel( src_ids=src_ids, diff --git a/ernie/finetune/mrc.py b/ernie/finetune/mrc.py index 3a7c5e7..3cc13a8 100644 --- a/ernie/finetune/mrc.py +++ b/ernie/finetune/mrc.py @@ -40,20 +40,18 @@ import tokenization log = logging.getLogger(__name__) def create_model(args, pyreader_name, ernie_config, is_training): - pyreader = fluid.layers.py_reader( - capacity=50, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1]], - dtypes=[ - 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64', - 'int64' - ], - lod_levels=[0, 0, 0, 0, 0, 0, 0, 0], - name=pyreader_name, - use_double_buffer=True) - (src_ids, sent_ids, pos_ids, task_ids, input_mask, start_positions, - end_positions, unique_id) = fluid.layers.read_file(pyreader) + src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64') + pos_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64') + sent_ids= fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64') + task_ids= fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64') + input_mask 
= fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32') + start_positions = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64') + end_positions = fluid.layers.data(name='7', shape=[-1, 1], dtype='int64') + unique_id = fluid.layers.data(name='8', shape=[-1, 1], dtype='int64') + + pyreader = fluid.io.DataLoader.from_generator(feed_list=[ + src_ids, sent_ids, pos_ids, task_ids, input_mask, start_positions, + end_positions, unique_id], capacity=50, iterable=False) ernie = ErnieModel( src_ids=src_ids, diff --git a/ernie/finetune/sequence_label.py b/ernie/finetune/sequence_label.py index d517634..1c76ec1 100644 --- a/ernie/finetune/sequence_label.py +++ b/ernie/finetune/sequence_label.py @@ -36,20 +36,17 @@ from model.ernie import ErnieModel log = logging.getLogger(__name__) def create_model(args, pyreader_name, ernie_config, is_prediction=False): - pyreader = fluid.layers.py_reader( - capacity=50, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1]], - dtypes=[ - 'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64' - ], - lod_levels=[0, 0, 0, 0, 0, 0, 0], - name=pyreader_name, - use_double_buffer=True) - - (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, - seq_lens) = fluid.layers.read_file(pyreader) + src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64') + sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64') + pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64') + task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64') + input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32') + labels = fluid.layers.data(name='7', shape=[-1, args.max_seq_len, 1], dtype='int64') + seq_lens = fluid.layers.data(name='8', shape=[-1], dtype='int64') + + pyreader = 
fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, seq_lens], + capacity=70, + iterable=False) ernie = ErnieModel( src_ids=src_ids, diff --git a/ernie/run_classifier.py b/ernie/run_classifier.py index 7e1dd82..095912b 100644 --- a/ernie/run_classifier.py +++ b/ernie/run_classifier.py @@ -228,7 +228,7 @@ def main(args): num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) - train_pyreader.decorate_tensor_provider(train_data_generator) + train_pyreader.set_batch_generator(train_data_generator) else: train_exe = None @@ -349,7 +349,7 @@ def main(args): # final eval on dianostic, hack for glue-ax if args.diagnostic: - test_pyreader.decorate_tensor_provider( + test_pyreader.set_batch_generator( reader.data_generator( args.diagnostic, batch_size=args.batch_size, @@ -380,7 +380,7 @@ def evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, # evaluate dev set batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size for ds in args.dev_set.split(','): - test_pyreader.decorate_tensor_provider( + test_pyreader.set_batch_generator( reader.data_generator( ds, batch_size=batch_size, @@ -409,7 +409,7 @@ def predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars, batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size for test_f, save_f in zip(test_sets, save_dirs): - test_pyreader.decorate_tensor_provider( + test_pyreader.set_batch_generator( reader.data_generator( test_f, batch_size=batch_size, diff --git a/ernie/run_mrc.py b/ernie/run_mrc.py index 51e5efd..b8b4e42 100644 --- a/ernie/run_mrc.py +++ b/ernie/run_mrc.py @@ -228,7 +228,7 @@ def main(args): num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) - train_pyreader.decorate_tensor_provider(train_data_generator) + train_pyreader.set_batch_generator(train_data_generator) else: train_exe = None @@ -272,7 +272,7 @@ def main(args): if steps % 
args.validation_steps == 0: if args.do_val: - test_pyreader.decorate_tensor_provider( + test_pyreader.set_batch_generator( reader.data_generator( args.dev_set, batch_size=args.batch_size, @@ -291,7 +291,7 @@ def main(args): args=args) if args.do_test: - test_pyreader.decorate_tensor_provider( + test_pyreader.set_batch_generator( reader.data_generator( args.test_set, batch_size=args.batch_size, @@ -318,7 +318,7 @@ def main(args): # final eval on dev set if args.do_val: log.info("Final validation result:") - test_pyreader.decorate_tensor_provider( + test_pyreader.set_batch_generator( reader.data_generator( args.dev_set, batch_size=args.batch_size, @@ -339,7 +339,7 @@ def main(args): # final eval on test set if args.do_test: log.info("Final test result:") - test_pyreader.decorate_tensor_provider( + test_pyreader.set_batch_generator( reader.data_generator( args.test_set, batch_size=args.batch_size, diff --git a/ernie/run_sequence_labeling.py b/ernie/run_sequence_labeling.py index ce8b27e..2896154 100644 --- a/ernie/run_sequence_labeling.py +++ b/ernie/run_sequence_labeling.py @@ -217,7 +217,7 @@ def main(args): num_trainers=nccl2_num_trainers, trainer_id=nccl2_trainer_id) - train_pyreader.decorate_tensor_provider(train_data_generator) + train_pyreader.set_batch_generator(train_data_generator) else: train_exe = None @@ -302,7 +302,7 @@ def evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, # evaluate dev set batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size for ds in args.dev_set.split(','): #single card eval - test_pyreader.decorate_tensor_provider( + test_pyreader.set_batch_generator( reader.data_generator( ds, batch_size=batch_size, @@ -324,7 +324,7 @@ def predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars, batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size for test_f, save_f in zip(test_sets, save_dirs): - 
test_pyreader.decorate_tensor_provider(reader.data_generator( + test_pyreader.set_batch_generator(reader.data_generator( test_f, batch_size=batch_size, epoch=1, diff --git a/ernie/train.py b/ernie/train.py index dd9ebc8..708afdd 100644 --- a/ernie/train.py +++ b/ernie/train.py @@ -41,20 +41,17 @@ args = parser.parse_args() def create_model(pyreader_name, ernie_config): - pyreader = fluid.layers.py_reader( - capacity=70, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1], - [-1, 1], [-1, 1]], - dtypes=[ - 'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64' - ], - lod_levels=[0, 0, 0, 0, 0, 0, 0], - name=pyreader_name, - use_double_buffer=True) - - (src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, - labels) = fluid.layers.read_file(pyreader) + src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64') + pos_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64') + sent_ids= fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64') + input_mask = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='float32') + mask_label = fluid.layers.data(name='5', shape=[-1, 1], dtype='int64') + mask_pos = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64') + labels = fluid.layers.data(name='r', shape=[-1, 1], dtype='int64') + + pyreader = fluid.io.DataLoader.from_generator(feed_list=[ + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels + ], capacity=70, iterable=False) ernie = ErnieModel( src_ids=src_ids, @@ -97,7 +94,7 @@ def predict_wrapper(args, def predict(exe=exe, pyreader=pyreader): - pyreader.decorate_tensor_provider(data_reader.data_generator()) + pyreader.set_batch_generator(data_reader.data_generator()) pyreader.start() cost = 0 @@ -285,7 +282,7 @@ def train(args): next_sent_acc.name, mask_lm_loss.name, total_loss.name ]) - 
train_pyreader.decorate_tensor_provider(data_reader.data_generator()) + train_pyreader.set_batch_generator(data_reader.data_generator()) train_pyreader.start() steps = 0 cost = [] -- GitLab