diff --git a/PaddleNLP/PaddleLARK/BERT/README.md b/PaddleNLP/PaddleLARK/BERT/README.md index 626f382aa37994b72ffc92921bf6b7a77f761278..7ed1d28bca0f5454995b6e67951894b7430c20bc 100644 --- a/PaddleNLP/PaddleLARK/BERT/README.md +++ b/PaddleNLP/PaddleLARK/BERT/README.md @@ -70,7 +70,7 @@ ``` ## 安装 -本项目依赖于 Paddle Fluid **1.5.1** 及以上版本,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。如果需要进行 TensorFlow 模型到 Paddle Fluid 参数的转换,则需要同时安装 TensorFlow 1.12。 +本项目依赖于 Paddle Fluid **1.6.0** 及以上版本,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。如果需要进行 TensorFlow 模型到 Paddle Fluid 参数的转换,则需要同时安装 TensorFlow 1.12。 ## 预训练 diff --git a/PaddleNLP/PaddleLARK/BERT/batching.py b/PaddleNLP/PaddleLARK/BERT/batching.py index f8cc76b9646e82fb9862d3c346078f2460dee82f..7a214700a9e2db27900602c235c32e435e7b85fb 100644 --- a/PaddleNLP/PaddleLARK/BERT/batching.py +++ b/PaddleNLP/PaddleLARK/BERT/batching.py @@ -155,7 +155,7 @@ def pad_batch_data(insts, inst_data = np.array([ list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts ]) - return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] + return_list += [inst_data.astype("int64").reshape([-1, max_len])] # position data if return_pos: @@ -164,7 +164,7 @@ def pad_batch_data(insts, for inst in insts ]) - return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] + return_list += [inst_pos.astype("int64").reshape([-1, max_len])] if return_input_mask: # This is used to avoid attention on paddings. 
diff --git a/PaddleNLP/PaddleLARK/BERT/model/bert.py b/PaddleNLP/PaddleLARK/BERT/model/bert.py index c17803caed17e81fafd55f9b9ae9f2b539f9f39c..f3f8c0941c8a5adc52f7b543447cf24823d77fc6 100644 --- a/PaddleNLP/PaddleLARK/BERT/model/bert.py +++ b/PaddleNLP/PaddleLARK/BERT/model/bert.py @@ -82,21 +82,21 @@ class BertModel(object): def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding( + emb_out = fluid.embedding( input=src_ids, size=[self._voc_size, self._emb_size], dtype=self._dtype, param_attr=fluid.ParamAttr( name=self._word_emb_name, initializer=self._param_initializer), is_sparse=False) - position_emb_out = fluid.layers.embedding( + position_emb_out = fluid.embedding( input=position_ids, size=[self._max_position_seq_len, self._emb_size], dtype=self._dtype, param_attr=fluid.ParamAttr( name=self._pos_emb_name, initializer=self._param_initializer)) - sent_emb_out = fluid.layers.embedding( + sent_emb_out = fluid.embedding( sentence_ids, size=[self._sent_types, self._emb_size], dtype=self._dtype, @@ -148,6 +148,7 @@ class BertModel(object): input=self._enc_out, axes=[1], starts=[0], ends=[1]) next_sent_feat = fluid.layers.fc( input=next_sent_feat, + num_flatten_dims=2, size=self._emb_size, act="tanh", param_attr=fluid.ParamAttr( @@ -209,11 +210,14 @@ class BertModel(object): next_sent_fc_out = fluid.layers.fc( input=next_sent_feat, + num_flatten_dims=2, size=2, param_attr=fluid.ParamAttr( name="next_sent_fc.w_0", initializer=self._param_initializer), bias_attr="next_sent_fc.b_0") + next_sent_fc_out = fluid.layers.reshape( + next_sent_fc_out, [-1, 2], inplace=True) next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( logits=next_sent_fc_out, label=labels, return_softmax=True) diff --git a/PaddleNLP/PaddleLARK/BERT/model/classifier.py b/PaddleNLP/PaddleLARK/BERT/model/classifier.py index 
ee4e8751725d0fc5e3a1bff9fcb57ec799391d8d..03bfb7aa1504253399b5bdd4d1f4fe9c41cad27b 100644 --- a/PaddleNLP/PaddleLARK/BERT/model/classifier.py +++ b/PaddleNLP/PaddleLARK/BERT/model/classifier.py @@ -25,15 +25,14 @@ from model.bert import BertModel def create_model(args, bert_config, num_labels, is_prediction=False): input_fields = { 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'labels'], - 'shapes': - [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]], + 'shapes': [[None, None], [None, None], [None, None], + [None, args.max_seq_len, 1], [None, 1]], 'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'], 'lod_levels': [0, 0, 0, 0, 0], } inputs = [ - fluid.layers.data( + fluid.data( name=input_fields['names'][i], shape=input_fields['shapes'][i], dtype=input_fields['dtypes'][i], @@ -42,7 +41,8 @@ def create_model(args, bert_config, num_labels, is_prediction=False): ] (src_ids, pos_ids, sent_ids, input_mask, labels) = inputs - pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False) + data_loader = fluid.io.DataLoader.from_generator( + feed_list=inputs, capacity=50, iterable=False) bert = BertModel( src_ids=src_ids, @@ -59,6 +59,7 @@ def create_model(args, bert_config, num_labels, is_prediction=False): dropout_implementation="upscale_in_train") logits = fluid.layers.fc( input=cls_feats, + num_flatten_dims=2, size=num_labels, param_attr=fluid.ParamAttr( name="cls_out_w", @@ -71,8 +72,9 @@ def create_model(args, bert_config, num_labels, is_prediction=False): feed_targets_name = [ src_ids.name, pos_ids.name, sent_ids.name, input_mask.name ] - return pyreader, probs, feed_targets_name + return data_loader, probs, feed_targets_name + logits = fluid.layers.reshape(logits, [-1, num_labels], inplace=True) ce_loss, probs = fluid.layers.softmax_with_cross_entropy( logits=logits, label=labels, return_softmax=True) loss = fluid.layers.mean(x=ce_loss) @@ -80,4 +82,4 @@ def 
create_model(args, bert_config, num_labels, is_prediction=False): num_seqs = fluid.layers.create_tensor(dtype='int64') accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) - return pyreader, loss, probs, accuracy, num_seqs + return data_loader, loss, probs, accuracy, num_seqs diff --git a/PaddleNLP/PaddleLARK/BERT/optimization.py b/PaddleNLP/PaddleLARK/BERT/optimization.py index c0795018a883832b49660024aaa0ccac489c4a11..0771ab77922dea90104ff67ab201bfb307212637 100644 --- a/PaddleNLP/PaddleLARK/BERT/optimization.py +++ b/PaddleNLP/PaddleLARK/BERT/optimization.py @@ -73,9 +73,10 @@ def optimization(loss, .noam_decay(1/(warmup_steps *(learning_rate ** 2)), warmup_steps) else: - printf( - "WARNING: noam decay should have postive warmup steps, using " - "constant learning rate instead!") + print( + "WARNING: noam decay of learning rate should have positive warmup " + "steps but given {}, using constant learning rate instead!" + .format(warmup_steps)) scheduled_lr = fluid.layers.create_global_var( name=fluid.unique_name.generate("learning_rate"), shape=[1], @@ -83,8 +84,20 @@ def optimization(loss, dtype='float32', persistable=True) elif scheduler == 'linear_warmup_decay': - scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, - num_train_steps) + if warmup_steps > 0: + scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, + num_train_steps) + else: + print( + "WARNING: linear warmup decay of learning rate should have " + "positive warmup steps but given {}, using constant learning rate " + "instead!".format(warmup_steps)) + scheduled_lr = fluid.layers.create_global_var( + name=fluid.unique_name.generate("learning_rate"), + shape=[1], + value=learning_rate, + dtype='float32', + persistable=True) else: raise ValueError("Unkown learning rate scheduler, should be " "'noam_decay' or 'linear_warmup_decay'") diff --git a/PaddleNLP/PaddleLARK/BERT/predict_classifier.py b/PaddleNLP/PaddleLARK/BERT/predict_classifier.py index 
0f0f842865549b6c6c2ba97dce37f9007e39c697..2e54bc8fb8271c2c1ccfd46180529ab1c7cf9a9c 100644 --- a/PaddleNLP/PaddleLARK/BERT/predict_classifier.py +++ b/PaddleNLP/PaddleLARK/BERT/predict_classifier.py @@ -17,6 +17,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import six +import sys +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf8') + import os import time import argparse @@ -82,7 +88,7 @@ def main(args): predict_startup = fluid.Program() with fluid.program_guard(predict_prog, predict_startup): with fluid.unique_name.guard(): - predict_pyreader, probs, feed_target_names = create_model( + predict_data_loader, probs, feed_target_names = create_model( args, bert_config=bert_config, num_labels=num_labels, @@ -112,11 +118,11 @@ def main(args): predict_exe = fluid.ParallelExecutor( use_cuda=args.use_cuda, main_program=predict_prog) - predict_pyreader.decorate_batch_generator( + predict_data_loader.set_batch_generator( processor.data_generator( batch_size=args.batch_size, phase='test', epoch=1, shuffle=False)) - predict_pyreader.start() + predict_data_loader.start() all_results = [] time_begin = time.time() while True: @@ -124,7 +130,7 @@ def main(args): results = predict_exe.run(fetch_list=[probs.name]) all_results.extend(results[0]) except fluid.core.EOFException: - predict_pyreader.reset() + predict_data_loader.reset() break time_end = time.time() diff --git a/PaddleNLP/PaddleLARK/BERT/reader/squad.py b/PaddleNLP/PaddleLARK/BERT/reader/squad.py index fb9ec657be6cb12ad86ed7afbcd8dab135ade2b3..a3b965f6cfb4ce83c0c79a6af2612e3591f999d5 100644 --- a/PaddleNLP/PaddleLARK/BERT/reader/squad.py +++ b/PaddleNLP/PaddleLARK/BERT/reader/squad.py @@ -307,32 +307,33 @@ def convert_examples_to_features( end_position = 0 if example_index < 3: - print("*** Example ***") - print("unique_id: %s" % (unique_id)) - print("example_index: %s" % (example_index)) - print("doc_span_index: %s" % (doc_span_index)) - 
print("tokens: %s" % " ".join( + print(u"*** Example ***") + print(u"unique_id: %s" % (unique_id)) + print(u"example_index: %s" % (example_index)) + print(u"doc_span_index: %s" % (doc_span_index)) + print(u"tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) - print("token_to_orig_map: %s" % " ".join([ + print(u"token_to_orig_map: %s" % " ".join([ "%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map) ])) - print("token_is_max_context: %s" % " ".join([ + print(u"token_is_max_context: %s" % " ".join([ "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) ])) - print("input_ids: %s" % " ".join([str(x) for x in input_ids])) - print("input_mask: %s" % " ".join([str(x) for x in input_mask])) - print("segment_ids: %s" % + print(u"input_ids: %s" % " ".join([str(x) for x in input_ids])) + print(u"input_mask: %s" % + " ".join([str(x) for x in input_mask])) + print(u"segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if is_training and example.is_impossible: - print("impossible example") + print(u"impossible example") if is_training and not example.is_impossible: answer_text = " ".join(tokens[start_position:(end_position + 1)]) - print("start_position: %d" % (start_position)) - print("end_position: %d" % (end_position)) - print("answer: %s" % + print(u"start_position: %d" % (start_position)) + print(u"end_position: %d" % (end_position)) + print(u"answer: %s" % (tokenization.printable_text(answer_text))) feature = InputFeatures( @@ -825,7 +826,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose): start_position = tok_text.find(pred_text) if start_position == -1: if verbose: - print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + print(u"Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -834,7 +835,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose): if len(orig_ns_text) != len(tok_ns_text): 
if verbose: - print("Length not equal after stripping spaces: '%s' vs '%s'", + print(u"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text @@ -852,7 +853,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose): if orig_start_position is None: if verbose: - print("Couldn't map start position") + print(u"Couldn't map start position") return orig_text orig_end_position = None @@ -863,7 +864,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose): if orig_end_position is None: if verbose: - print("Couldn't map end position") + print(u"Couldn't map end position") return orig_text output_text = orig_text[orig_start_position:(orig_end_position + 1)] diff --git a/PaddleNLP/PaddleLARK/BERT/run_classifier.py b/PaddleNLP/PaddleLARK/BERT/run_classifier.py index 81a8becc5f6947fd571abf77bb83e683aaec168d..1e769d3446da2c92f27ae0f13f31db3b2f30735a 100644 --- a/PaddleNLP/PaddleLARK/BERT/run_classifier.py +++ b/PaddleNLP/PaddleLARK/BERT/run_classifier.py @@ -17,9 +17,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import six import sys -reload(sys) -sys.setdefaultencoding('utf8') +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf8') import os import time @@ -107,8 +109,8 @@ args = parser.parse_args() # yapf: enable. 
-def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase): - test_pyreader.start() +def evaluate(exe, test_program, test_data_loader, fetch_list, eval_phase): + test_data_loader.start() total_cost, total_acc, total_num_seqs = [], [], [] time_begin = time.time() while True: @@ -119,7 +121,7 @@ def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase): total_acc.extend(np_acc * np_num_seqs) total_num_seqs.extend(np_num_seqs) except fluid.core.EOFException: - test_pyreader.reset() + test_data_loader.reset() break time_end = time.time() print("[%s evaluation] ave loss: %f, ave acc: %f, elapsed time: %f s" % @@ -203,7 +205,7 @@ def main(args): with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): - train_pyreader, loss, probs, accuracy, num_seqs = create_model( + train_data_loader, loss, probs, accuracy, num_seqs = create_model( args, bert_config=bert_config, num_labels=num_labels) @@ -224,28 +226,17 @@ def main(args): incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) - if args.verbose: - if args.in_tokens: - lower_mem, upper_mem, unit = fluid.contrib.memory_usage( - program=train_program, - batch_size=args.batch_size // args.max_seq_len) - else: - lower_mem, upper_mem, unit = fluid.contrib.memory_usage( - program=train_program, batch_size=args.batch_size) - print("Theoretical memory usage in training: %.3f - %.3f %s" % - (lower_mem, upper_mem, unit)) - if args.do_val: dev_prog = fluid.Program() with fluid.program_guard(dev_prog, startup_prog): with fluid.unique_name.guard(): - dev_pyreader, loss, probs, accuracy, num_seqs = create_model( + dev_data_loader, loss, probs, accuracy, num_seqs = create_model( args, bert_config=bert_config, num_labels=num_labels) dev_prog = dev_prog.clone(for_test=True) - dev_pyreader.decorate_batch_generator( + dev_data_loader.set_batch_generator( processor.data_generator( batch_size=args.batch_size, phase='dev', @@ -257,13 +248,13 @@ def main(args): test_prog = 
fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - test_pyreader, loss, probs, accuracy, num_seqs = create_model( + test_data_loader, loss, probs, accuracy, num_seqs = create_model( args, bert_config=bert_config, num_labels=num_labels) test_prog = test_prog.clone(for_test=True) - test_pyreader.decorate_batch_generator( + test_data_loader.set_batch_generator( processor.data_generator( batch_size=args.batch_size, phase='test', @@ -316,11 +307,11 @@ def main(args): train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) - train_pyreader.decorate_batch_generator(train_data_generator, place) + train_data_loader.set_batch_generator(train_data_generator, place) if args.do_train: - train_pyreader.start() + train_data_loader.start() steps = 0 total_cost, total_acc, total_num_seqs = [], [], [] time_begin = time.time() @@ -350,7 +341,7 @@ def main(args): total_num_seqs.extend(np_num_seqs) if args.verbose: - verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( + verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size( ) verbose += "learning rate: %f" % np_lr[0] if args.use_fp16: @@ -386,18 +377,18 @@ def main(args): throughput = [] # evaluate dev set if args.do_val: - evaluate(exe, dev_prog, dev_pyreader, + evaluate(exe, dev_prog, dev_data_loader, [loss.name, accuracy.name, num_seqs.name], "dev") # evaluate test set if args.do_test: - evaluate(exe, test_prog, test_pyreader, + evaluate(exe, test_prog, test_data_loader, [loss.name, accuracy.name, num_seqs.name], "test") except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.io.save_persistables(exe, save_path, train_program) - train_pyreader.reset() + train_data_loader.reset() break if args.enable_ce: card_num = get_cards() @@ -421,13 +412,13 @@ def main(args): # final eval on dev set if args.do_val: print("Final 
validation result:") - evaluate(exe, dev_prog, dev_pyreader, + evaluate(exe, dev_prog, dev_data_loader, [loss.name, accuracy.name, num_seqs.name], "dev") # final eval on test set if args.do_test: print("Final test result:") - evaluate(exe, test_prog, test_pyreader, + evaluate(exe, test_prog, test_data_loader, [loss.name, accuracy.name, num_seqs.name], "test") diff --git a/PaddleNLP/PaddleLARK/BERT/run_squad.py b/PaddleNLP/PaddleLARK/BERT/run_squad.py index 8b906986ba72974898b3295598f9ae9d973ca9e7..fc3659b61eeb7d06c252a61f83c99fbe3adfea97 100644 --- a/PaddleNLP/PaddleLARK/BERT/run_squad.py +++ b/PaddleNLP/PaddleLARK/BERT/run_squad.py @@ -17,9 +17,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import six import sys -reload(sys) -sys.setdefaultencoding('utf8') +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf8') import argparse import collections @@ -108,9 +110,8 @@ def create_model(bert_config, is_training=False): if is_training: input_fields = { 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'start_positions', 'end_positions'], - 'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]], + 'shapes': [[None, None], [None, None], [None, None], + [None, args.max_seq_len, 1], [None, 1], [None, 1]], 'dtypes': [ 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'], 'lod_levels': [0, 0, 0, 0, 0, 0], @@ -118,20 +119,19 @@ def create_model(bert_config, is_training=False): else: input_fields = { 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'unique_id'], - 'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, 1]], + 'shapes': [[None, None], [None, None], [None, None], + [None, args.max_seq_len, 1], [None, 1]], 'dtypes': [ 'int64', 'int64', 'int64', 'float32', 'int64'], 'lod_levels': [0, 0, 0, 0, 0], } - inputs = 
[fluid.layers.data(name=input_fields['names'][i], + inputs = [fluid.data(name=input_fields['names'][i], shape=input_fields['shapes'][i], dtype=input_fields['dtypes'][i], lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names']))] - pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False) + data_loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=50, iterable=False) if is_training: (src_ids, pos_ids, sent_ids, input_mask, start_positions, end_positions) = inputs @@ -176,23 +176,23 @@ def create_model(bert_config, is_training=False): start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 - return pyreader, total_loss, num_seqs + return data_loader, total_loss, num_seqs else: - return pyreader, unique_id, start_logits, end_logits, num_seqs + return data_loader, unique_id, start_logits, end_logits, num_seqs RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) -def predict(test_exe, test_program, test_pyreader, fetch_list, processor): +def predict(test_exe, test_program, test_data_loader, fetch_list, processor): if not os.path.exists(args.checkpoints): os.makedirs(args.checkpoints) output_prediction_file = os.path.join(args.checkpoints, "predictions.json") output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json") - test_pyreader.start() + test_data_loader.start() all_results = [] time_begin = time.time() while True: @@ -211,7 +211,7 @@ def predict(test_exe, test_program, test_pyreader, fetch_list, processor): start_logits=start_logits, end_logits=end_logits)) except fluid.core.EOFException: - test_pyreader.reset() + test_data_loader.reset() break time_end = time.time() @@ -279,7 +279,7 @@ def train(args): train_program = fluid.Program() with fluid.program_guard(train_program, 
startup_prog): with fluid.unique_name.guard(): - train_pyreader, loss, num_seqs = create_model( + train_data_loader, loss, num_seqs = create_model( bert_config=bert_config, is_training=True) @@ -300,22 +300,11 @@ def train(args): incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) - if args.verbose: - if args.in_tokens: - lower_mem, upper_mem, unit = fluid.contrib.memory_usage( - program=train_program, - batch_size=args.batch_size // args.max_seq_len) - else: - lower_mem, upper_mem, unit = fluid.contrib.memory_usage( - program=train_program, batch_size=args.batch_size) - print("Theoretical memory usage in training: %.3f - %.3f %s" % - (lower_mem, upper_mem, unit)) - if args.do_predict: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model( + test_data_loader, unique_ids, start_logits, end_logits, num_seqs = create_model( bert_config=bert_config, is_training=False) @@ -359,9 +348,9 @@ def train(args): train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel( loss_name=loss.name, exec_strategy=exec_strategy) - train_pyreader.decorate_batch_generator(train_data_generator, place) + train_data_loader.set_batch_generator(train_data_generator, place) - train_pyreader.start() + train_data_loader.start() steps = 0 total_cost, total_num_seqs = [], [] time_begin = time.time() @@ -387,7 +376,7 @@ def train(args): total_num_seqs.extend(np_num_seqs) if args.verbose: - verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size( + verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size( ) verbose += "learning rate: %f " % np_lr[0] if args.use_fp16: @@ -414,11 +403,11 @@ def train(args): save_path = os.path.join(args.checkpoints, "step_" + str(steps) + "_final") fluid.io.save_persistables(exe, save_path, train_program) - train_pyreader.reset() + train_data_loader.reset() break if 
args.do_predict: - test_pyreader.decorate_batch_generator( + test_data_loader.set_batch_generator( processor.data_generator( data_path=args.predict_file, batch_size=args.batch_size, @@ -427,7 +416,7 @@ def train(args): dev_count=1, epoch=1), place) - predict(exe, test_prog, test_pyreader, [ + predict(exe, test_prog, test_data_loader, [ unique_ids.name, start_logits.name, end_logits.name, num_seqs.name ], processor) diff --git a/PaddleNLP/PaddleLARK/BERT/train.py b/PaddleNLP/PaddleLARK/BERT/train.py index 43642866e8519b17397c4a07868a7b924374b9b4..f9a1fb49ef4fda9957f27e90b7a18cea6f6d4e4b 100644 --- a/PaddleNLP/PaddleLARK/BERT/train.py +++ b/PaddleNLP/PaddleLARK/BERT/train.py @@ -17,13 +17,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import six import sys -reload(sys) -sys.setdefaultencoding('utf8') +if six.PY2: + reload(sys) + sys.setdefaultencoding('utf8') import os import time -import sys import argparse import numpy as np import multiprocessing @@ -98,21 +99,20 @@ args = parser.parse_args() def create_model(bert_config): input_fields = { 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'mask_label', 'mask_pos', 'labels'], - 'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1]], + 'shapes': [[None, None], [None, None], [None, None], + [None, None, 1], [None, 1], [None, 1], [None, 1]], 'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'], 'lod_levels': [0, 0, 0, 0, 0, 0, 0], } - inputs = [fluid.layers.data(name=input_fields['names'][i], + inputs = [fluid.data(name=input_fields['names'][i], shape=input_fields['shapes'][i], dtype=input_fields['dtypes'][i], lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names']))] (src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels) = inputs - pyreader = fluid.io.PyReader(feed_list=inputs, 
capacity=50, iterable=False) + data_loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=50, iterable=False) bert = BertModel( src_ids=src_ids, @@ -126,14 +126,14 @@ def create_model(bert_config): next_sent_acc, mask_lm_loss, total_loss = bert.get_pretraining_output( mask_label, mask_pos, labels) - return pyreader, next_sent_acc, mask_lm_loss, total_loss + return data_loader, next_sent_acc, mask_lm_loss, total_loss def predict_wrapper(args, exe, bert_config, test_prog=None, - pyreader=None, + data_loader=None, fetch_list=None): # Context to do validation. data_path = args.test_set_dir if args.do_test else args.validation_set_dir @@ -148,7 +148,7 @@ def predict_wrapper(args, max_seq_len=args.max_seq_len, is_test=True) - pyreader.decorate_batch_generator(data_reader.data_generator()) + data_loader.set_batch_generator(data_reader.data_generator()) if args.do_test: assert args.init_checkpoint is not None, "[FATAL] Please use --init_checkpoint '/path/to/checkpoints' \ @@ -156,8 +156,8 @@ def predict_wrapper(args, init_pretraining_params(exe, args.init_checkpoint, test_prog) - def predict(exe=exe, pyreader=pyreader): - pyreader.start() + def predict(exe=exe, data_loader=data_loader): + data_loader.start() cost = 0 lm_cost = 0 @@ -176,7 +176,7 @@ def predict_wrapper(args, print("[test_set] steps: %d" % steps) except fluid.core.EOFException: - pyreader.reset() + data_loader.reset() break used_time = time.time() - time_begin @@ -193,7 +193,7 @@ def test(args): test_startup = fluid.Program() with fluid.program_guard(test_prog, test_startup): with fluid.unique_name.guard(): - test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model( + test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model( bert_config=bert_config) test_prog = test_prog.clone(for_test=True) @@ -207,7 +207,7 @@ def test(args): exe, bert_config, test_prog=test_prog, - pyreader=test_pyreader, + data_loader=test_data_loader, fetch_list=[next_sent_acc.name, 
mask_lm_loss.name, total_loss.name]) print("test begin") @@ -228,7 +228,7 @@ def train(args): startup_prog = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): - train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model( + train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model( bert_config=bert_config) scheduled_lr, loss_scaling = optimization( loss=total_loss, @@ -250,7 +250,7 @@ def train(args): test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): - test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model( + test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model( bert_config=bert_config) test_prog = test_prog.clone(for_test=True) @@ -263,16 +263,6 @@ def train(args): dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) print("Device count %d" % dev_count) - if args.verbose: - if args.in_tokens: - lower_mem, upper_mem, unit = fluid.contrib.memory_usage( - program=train_program, - batch_size=args.batch_size // args.max_seq_len) - else: - lower_mem, upper_mem, unit = fluid.contrib.memory_usage( - program=train_program, batch_size=args.batch_size) - print("Theoretical memory usage in training: %.3f - %.3f %s" % - (lower_mem, upper_mem, unit)) nccl2_num_trainers = 1 nccl2_trainer_id = 0 @@ -345,13 +335,13 @@ def train(args): exe, bert_config, test_prog=test_prog, - pyreader=test_pyreader, + data_loader=test_data_loader, fetch_list=[ next_sent_acc.name, mask_lm_loss.name, total_loss.name ]) - train_pyreader.decorate_batch_generator(data_reader.data_generator()) - train_pyreader.start() + train_data_loader.set_batch_generator(data_reader.data_generator()) + train_data_loader.start() steps = 0 cost = [] lm_cost = [] @@ -402,7 +392,7 @@ def train(args): epoch, current_file_index, total_file, current_file = data_reader.get_progress( ) if args.verbose: - verbose = "feed_queue size: %d, 
" %train_pyreader.queue.size() + verbose = "feed_queue size: %d, " %train_data_loader.queue.size() verbose += "current learning_rate: %f, " % np_lr[0] if args.use_fp16: verbose += "loss scaling: %f" % np_scaling[0] @@ -437,7 +427,7 @@ def train(args): np.mean(np.array(vali_acc) / vali_steps), vali_speed)) except fluid.core.EOFException: - train_pyreader.reset() + train_data_loader.reset() break if __name__ == '__main__': diff --git a/PaddleNLP/PaddleLARK/BERT/utils/fp16.py b/PaddleNLP/PaddleLARK/BERT/utils/fp16.py index 740add267dff2dbf463032bcc47a6741ca9f7c43..c68e1aceeeafdba34516769e380ca702c752946f 100644 --- a/PaddleNLP/PaddleLARK/BERT/utils/fp16.py +++ b/PaddleNLP/PaddleLARK/BERT/utils/fp16.py @@ -119,10 +119,11 @@ def create_master_params_grads(params_grads, main_prog, startup_prog, def master_param_to_train_param(master_params_grads, params_grads, main_prog): for idx, m_p_g in enumerate(master_params_grads): train_p, _ = params_grads[idx] - if train_p.name.find("layer_norm") > -1: - continue with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): - append_cast_op(m_p_g[0], train_p, main_prog) + if train_p.name.find("layer_norm") > -1: + fluid.layers.assign(m_p_g[0], train_p) + else: + append_cast_op(m_p_g[0], train_p, main_prog) def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,