diff --git a/PaddleNLP/language_representations_kit/BERT/batching.py b/PaddleNLP/language_representations_kit/BERT/batching.py
index f8cc76b9646e82fb9862d3c346078f2460dee82f..7a214700a9e2db27900602c235c32e435e7b85fb 100644
--- a/PaddleNLP/language_representations_kit/BERT/batching.py
+++ b/PaddleNLP/language_representations_kit/BERT/batching.py
@@ -155,7 +155,7 @@ def pad_batch_data(insts,
     inst_data = np.array([
         list(inst) + list([pad_idx] * (max_len - len(inst)))
         for inst in insts
     ])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+    return_list += [inst_data.astype("int64").reshape([-1, max_len])]
 
     # position data
     if return_pos:
@@ -164,7 +164,7 @@ def pad_batch_data(insts,
             for inst in insts
         ])
 
-        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
 
     if return_input_mask:
         # This is used to avoid attention on paddings.
diff --git a/PaddleNLP/language_representations_kit/BERT/model/bert.py b/PaddleNLP/language_representations_kit/BERT/model/bert.py
index c17803caed17e81fafd55f9b9ae9f2b539f9f39c..f3f8c0941c8a5adc52f7b543447cf24823d77fc6 100644
--- a/PaddleNLP/language_representations_kit/BERT/model/bert.py
+++ b/PaddleNLP/language_representations_kit/BERT/model/bert.py
@@ -82,21 +82,21 @@ class BertModel(object):
     def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
         # padding id in vocabulary must be set to 0
-        emb_out = fluid.layers.embedding(
+        emb_out = fluid.embedding(
             input=src_ids,
             size=[self._voc_size, self._emb_size],
             dtype=self._dtype,
             param_attr=fluid.ParamAttr(
                 name=self._word_emb_name, initializer=self._param_initializer),
             is_sparse=False)
 
-        position_emb_out = fluid.layers.embedding(
+        position_emb_out = fluid.embedding(
             input=position_ids,
             size=[self._max_position_seq_len, self._emb_size],
             dtype=self._dtype,
             param_attr=fluid.ParamAttr(
                 name=self._pos_emb_name, initializer=self._param_initializer))
 
-        sent_emb_out = fluid.layers.embedding(
+        sent_emb_out = fluid.embedding(
             sentence_ids,
             size=[self._sent_types, self._emb_size],
             dtype=self._dtype,
@@ -148,6 +148,7 @@ class BertModel(object):
             input=self._enc_out, axes=[1], starts=[0], ends=[1])
         next_sent_feat = fluid.layers.fc(
             input=next_sent_feat,
+            num_flatten_dims=2,
             size=self._emb_size,
             act="tanh",
             param_attr=fluid.ParamAttr(
@@ -209,11 +210,14 @@ class BertModel(object):
 
         next_sent_fc_out = fluid.layers.fc(
             input=next_sent_feat,
+            num_flatten_dims=2,
             size=2,
             param_attr=fluid.ParamAttr(
                 name="next_sent_fc.w_0", initializer=self._param_initializer),
             bias_attr="next_sent_fc.b_0")
+        next_sent_fc_out = fluid.layers.reshape(
+            next_sent_fc_out, [-1, 2], inplace=True)
 
         next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
             logits=next_sent_fc_out, label=labels, return_softmax=True)
 
diff --git a/PaddleNLP/language_representations_kit/BERT/model/classifier.py b/PaddleNLP/language_representations_kit/BERT/model/classifier.py
index ee4e8751725d0fc5e3a1bff9fcb57ec799391d8d..186d543184d1bc35722c58484cfee2a5ae983a70 100644
--- a/PaddleNLP/language_representations_kit/BERT/model/classifier.py
+++ b/PaddleNLP/language_representations_kit/BERT/model/classifier.py
@@ -25,9 +25,8 @@ from model.bert import BertModel
 def create_model(args, bert_config, num_labels, is_prediction=False):
     input_fields = {
         'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'labels'],
-        'shapes':
-        [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-         [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]],
+        'shapes': [[None, None], [None, None], [None, None],
+                   [-1, args.max_seq_len, 1], [-1, 1]],
         'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'],
         'lod_levels': [0, 0, 0, 0, 0],
     }
@@ -59,6 +58,7 @@ def create_model(args, bert_config, num_labels, is_prediction=False):
         dropout_implementation="upscale_in_train")
     logits = fluid.layers.fc(
         input=cls_feats,
+        num_flatten_dims=2,
         size=num_labels,
         param_attr=fluid.ParamAttr(
             name="cls_out_w",
@@ -73,6 +73,7 @@ def create_model(args, bert_config, num_labels, is_prediction=False):
         ]
         return pyreader, probs, feed_targets_name
 
+    logits = fluid.layers.reshape(logits, [-1, num_labels], inplace=True)
     ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
         logits=logits, label=labels, return_softmax=True)
     loss = fluid.layers.mean(x=ce_loss)
diff --git a/PaddleNLP/language_representations_kit/BERT/run_classifier.py b/PaddleNLP/language_representations_kit/BERT/run_classifier.py
index 81a8becc5f6947fd571abf77bb83e683aaec168d..3daa819ba5252b6b37669afea9d8de837ba8d400 100644
--- a/PaddleNLP/language_representations_kit/BERT/run_classifier.py
+++ b/PaddleNLP/language_representations_kit/BERT/run_classifier.py
@@ -224,17 +224,6 @@ def main(args):
                     incr_ratio=args.incr_ratio,
                     decr_ratio=args.decr_ratio)
 
-        if args.verbose:
-            if args.in_tokens:
-                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                    program=train_program,
-                    batch_size=args.batch_size // args.max_seq_len)
-            else:
-                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                    program=train_program, batch_size=args.batch_size)
-            print("Theoretical memory usage in training: %.3f - %.3f %s" %
-                  (lower_mem, upper_mem, unit))
-
     if args.do_val:
         dev_prog = fluid.Program()
         with fluid.program_guard(dev_prog, startup_prog):
diff --git a/PaddleNLP/language_representations_kit/BERT/run_squad.py b/PaddleNLP/language_representations_kit/BERT/run_squad.py
index 8b906986ba72974898b3295598f9ae9d973ca9e7..7ae937a3e9a298d7003f6aa5fecbf9afefbcc8f5 100644
--- a/PaddleNLP/language_representations_kit/BERT/run_squad.py
+++ b/PaddleNLP/language_representations_kit/BERT/run_squad.py
@@ -108,8 +108,7 @@ def create_model(bert_config, is_training=False):
     if is_training:
         input_fields = {
             'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'start_positions', 'end_positions'],
-            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                       [-1, args.max_seq_len, 1],
+            'shapes': [[None, None], [None, None], [None, None],
                        [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
             'dtypes': [
                 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
@@ -118,8 +117,7 @@ def create_model(bert_config, is_training=False):
     else:
         input_fields = {
             'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'unique_id'],
-            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                       [-1, args.max_seq_len, 1],
+            'shapes': [[None, None], [None, None], [None, None],
                        [-1, args.max_seq_len, 1], [-1, 1]],
             'dtypes': [
                 'int64', 'int64', 'int64', 'float32', 'int64'],
@@ -300,17 +298,6 @@ def train(args):
                     incr_ratio=args.incr_ratio,
                     decr_ratio=args.decr_ratio)
 
-        if args.verbose:
-            if args.in_tokens:
-                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                    program=train_program,
-                    batch_size=args.batch_size // args.max_seq_len)
-            else:
-                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                    program=train_program, batch_size=args.batch_size)
-            print("Theoretical memory usage in training: %.3f - %.3f %s" %
-                  (lower_mem, upper_mem, unit))
-
    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
diff --git a/PaddleNLP/language_representations_kit/BERT/train.py b/PaddleNLP/language_representations_kit/BERT/train.py
index 43642866e8519b17397c4a07868a7b924374b9b4..6bc985c4d4c9b28794f74b17628ecff0dfb3800d 100644
--- a/PaddleNLP/language_representations_kit/BERT/train.py
+++ b/PaddleNLP/language_representations_kit/BERT/train.py
@@ -98,9 +98,8 @@ args = parser.parse_args()
 def create_model(bert_config):
     input_fields = {
         'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'mask_label', 'mask_pos', 'labels'],
-        'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                   [-1, args.max_seq_len, 1],
-                   [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1]],
+        'shapes': [[None, None], [None, None], [None, None],
+                   [None, None, 1], [None, 1], [None, 1], [None, 1]],
         'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'],
         'lod_levels': [0, 0, 0, 0, 0, 0, 0],
     }
@@ -263,16 +262,6 @@ def train(args):
     dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
     print("Device count %d" % dev_count)
 
-    if args.verbose:
-        if args.in_tokens:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                program=train_program,
-                batch_size=args.batch_size // args.max_seq_len)
-        else:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                program=train_program, batch_size=args.batch_size)
-        print("Theoretical memory usage in training: %.3f - %.3f %s" %
-              (lower_mem, upper_mem, unit))
 
     nccl2_num_trainers = 1
     nccl2_trainer_id = 0
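Illustrative sketch, not part of the diff above: a minimal standalone static-graph program, assuming a PaddlePaddle 1.6-era fluid API, exercising the same shape convention the changes adopt: token ids fed as 2-D [batch_size, seq_len] int64 tensors into fluid.embedding, a pooled [batch, 1, hidden] slice projected with fc(num_flatten_dims=2), and the result reshaped to [-1, num_classes] before the softmax loss. The vocabulary size, hidden size, and class count are placeholder values.

import numpy as np
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # 2-D int64 ids, [batch_size, seq_len], with no trailing [..., 1] dimension
    src_ids = fluid.data(name="src_ids", shape=[None, None], dtype="int64")
    # fluid.embedding accepts the 2-D ids directly and returns [batch, seq, hidden]
    emb = fluid.embedding(input=src_ids, size=[30522, 768], dtype="float32")
    # pooled feature of the first token, shape [batch, 1, hidden]
    pooled = fluid.layers.slice(emb, axes=[1], starts=[0], ends=[1])
    # num_flatten_dims=2 keeps the singleton dim: output is [batch, 1, num_classes]
    logits = fluid.layers.fc(input=pooled, num_flatten_dims=2, size=2)
    # flatten to [batch, num_classes] so softmax_with_cross_entropy sees 2-D logits
    logits = fluid.layers.reshape(logits, [-1, 2], inplace=True)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)
out, = exe.run(main_prog,
               feed={"src_ids": np.zeros((4, 16), dtype="int64")},
               fetch_list=[logits])
print(out.shape)  # expected: (4, 2)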