Unverified commit 8b7d837b, authored by Yibing Liu, committed by GitHub

Update bert to 1.6 api (#3849)

Parent f39f5776
@@ -70,7 +70,7 @@
 ```
 ## Installation
-This project requires Paddle Fluid **1.5.1** or later; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start). Converting TensorFlow model parameters to Paddle Fluid additionally requires TensorFlow 1.12.
+This project requires Paddle Fluid **1.6.0** or later; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start). Converting TensorFlow model parameters to Paddle Fluid additionally requires TensorFlow 1.12.
 ## Pre-training
......
@@ -155,7 +155,7 @@ def pad_batch_data(insts,
     inst_data = np.array([
         list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
     ])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+    return_list += [inst_data.astype("int64").reshape([-1, max_len])]
     # position data
     if return_pos:
@@ -164,7 +164,7 @@ def pad_batch_data(insts,
         for inst in insts
     ])
-    return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+    return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
     if return_input_mask:
         # This is used to avoid attention on paddings.
......
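The change above drops the trailing singleton dimension from the padded id tensors. A minimal numpy sketch of the resulting behaviour (not code from the repo, just an illustration with dummy data):

```python
import numpy as np

insts = [[5, 8, 2], [7, 1]]   # two token-id sequences of different length
pad_idx = 0
max_len = max(len(inst) for inst in insts)

# pad every instance to max_len, as pad_batch_data does
inst_data = np.array(
    [list(inst) + [pad_idx] * (max_len - len(inst)) for inst in insts])
padded = inst_data.astype("int64").reshape([-1, max_len])
print(padded.shape)  # (2, 3); before this change the shape was (2, 3, 1)
```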
@@ -82,21 +82,21 @@ class BertModel(object):
     def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
         # padding id in vocabulary must be set to 0
-        emb_out = fluid.layers.embedding(
+        emb_out = fluid.embedding(
             input=src_ids,
             size=[self._voc_size, self._emb_size],
             dtype=self._dtype,
             param_attr=fluid.ParamAttr(
                 name=self._word_emb_name, initializer=self._param_initializer),
             is_sparse=False)
-        position_emb_out = fluid.layers.embedding(
+        position_emb_out = fluid.embedding(
             input=position_ids,
             size=[self._max_position_seq_len, self._emb_size],
             dtype=self._dtype,
             param_attr=fluid.ParamAttr(
                 name=self._pos_emb_name, initializer=self._param_initializer))
-        sent_emb_out = fluid.layers.embedding(
+        sent_emb_out = fluid.embedding(
             sentence_ids,
             size=[self._sent_types, self._emb_size],
             dtype=self._dtype,
@@ -148,6 +148,7 @@ class BertModel(object):
             input=self._enc_out, axes=[1], starts=[0], ends=[1])
         next_sent_feat = fluid.layers.fc(
             input=next_sent_feat,
+            num_flatten_dims=2,
             size=self._emb_size,
             act="tanh",
             param_attr=fluid.ParamAttr(
@@ -209,11 +210,14 @@ class BertModel(object):
         next_sent_fc_out = fluid.layers.fc(
             input=next_sent_feat,
+            num_flatten_dims=2,
             size=2,
             param_attr=fluid.ParamAttr(
                 name="next_sent_fc.w_0", initializer=self._param_initializer),
             bias_attr="next_sent_fc.b_0")
+        next_sent_fc_out = fluid.layers.reshape(
+            next_sent_fc_out, [-1, 2], inplace=True)
         next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
             logits=next_sent_fc_out, label=labels, return_softmax=True)
......
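For context, a minimal sketch of the 1.6-style embedding call used above (assuming Paddle Fluid 1.6; the sizes and variable names here are illustrative, not taken from the repo). Unlike the older `fluid.layers.embedding`, `fluid.embedding` accepts ids of shape `[batch, seq_len]` without a trailing dimension of size 1:

```python
import paddle.fluid as fluid

vocab_size, emb_size = 30522, 768  # illustrative sizes
src_ids = fluid.data(name="src_ids", shape=[None, None], dtype="int64")

emb_out = fluid.embedding(
    input=src_ids,
    size=[vocab_size, emb_size],
    dtype="float32",
    param_attr=fluid.ParamAttr(name="word_embedding"),
    is_sparse=False)  # emb_out has shape [batch, seq_len, emb_size]
```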
@@ -25,15 +25,14 @@ from model.bert import BertModel
 def create_model(args, bert_config, num_labels, is_prediction=False):
     input_fields = {
         'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'labels'],
-        'shapes':
-            [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-             [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]],
+        'shapes': [[None, None], [None, None], [None, None],
+                   [None, args.max_seq_len, 1], [None, 1]],
         'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'],
         'lod_levels': [0, 0, 0, 0, 0],
     }
     inputs = [
-        fluid.layers.data(
+        fluid.data(
             name=input_fields['names'][i],
             shape=input_fields['shapes'][i],
             dtype=input_fields['dtypes'][i],
@@ -42,7 +41,8 @@ def create_model(args, bert_config, num_labels, is_prediction=False):
     ]
     (src_ids, pos_ids, sent_ids, input_mask, labels) = inputs
-    pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False)
+    data_loader = fluid.io.DataLoader.from_generator(
+        feed_list=inputs, capacity=50, iterable=False)
     bert = BertModel(
         src_ids=src_ids,
@@ -59,6 +59,7 @@ def create_model(args, bert_config, num_labels, is_prediction=False):
         dropout_implementation="upscale_in_train")
     logits = fluid.layers.fc(
         input=cls_feats,
+        num_flatten_dims=2,
         size=num_labels,
         param_attr=fluid.ParamAttr(
             name="cls_out_w",
@@ -71,8 +72,9 @@ def create_model(args, bert_config, num_labels, is_prediction=False):
         feed_targets_name = [
             src_ids.name, pos_ids.name, sent_ids.name, input_mask.name
         ]
-        return pyreader, probs, feed_targets_name
+        return data_loader, probs, feed_targets_name
+    logits = fluid.layers.reshape(logits, [-1, num_labels], inplace=True)
     ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
         logits=logits, label=labels, return_softmax=True)
     loss = fluid.layers.mean(x=ce_loss)
@@ -80,4 +82,4 @@ def create_model(args, bert_config, num_labels, is_prediction=False):
     num_seqs = fluid.layers.create_tensor(dtype='int64')
     accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
-    return pyreader, loss, probs, accuracy, num_seqs
+    return data_loader, loss, probs, accuracy, num_seqs
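This hunk captures the two 1.6 API moves made throughout the commit: `fluid.layers.data` becomes `fluid.data` with `None` for variable dimensions, and `fluid.io.PyReader` becomes `fluid.io.DataLoader.from_generator`. A condensed, self-contained sketch (assuming Paddle Fluid 1.6; the names, shapes and dummy generator are illustrative, not from the repo):

```python
import numpy as np
import paddle.fluid as fluid

# 1.6-style feed variables: None marks a dimension whose size varies per batch
src_ids = fluid.data(name="src_ids", shape=[None, None], dtype="int64")
labels = fluid.data(name="labels", shape=[None, 1], dtype="int64")

# non-iterable DataLoader bound to the same feed list, as in create_model
data_loader = fluid.io.DataLoader.from_generator(
    feed_list=[src_ids, labels], capacity=50, iterable=False)

def batch_generator():
    # yields one list of numpy arrays per batch, in feed_list order (dummy data)
    for _ in range(4):
        yield [np.zeros([8, 128], dtype="int64"),
               np.zeros([8, 1], dtype="int64")]

data_loader.set_batch_generator(batch_generator, fluid.CPUPlace())
```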
@@ -73,9 +73,10 @@ def optimization(loss,
             .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
                         warmup_steps)
         else:
-            printf(
-                "WARNING: noam decay should have postive warmup steps, using "
-                "constant learning rate instead!")
+            print(
+                "WARNING: noam decay of learning rate should have postive warmup "
+                "steps but given {}, using constant learning rate instead!"
+                .format(warmup_steps))
             scheduled_lr = fluid.layers.create_global_var(
                 name=fluid.unique_name.generate("learning_rate"),
                 shape=[1],
@@ -83,8 +84,20 @@ def optimization(loss,
                 dtype='float32',
                 persistable=True)
     elif scheduler == 'linear_warmup_decay':
+        if warmup_steps > 0:
             scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                                num_train_steps)
+        else:
+            print(
+                "WARNING: linear warmup decay of learning rate should have "
+                "postive warmup steps but given {}, use constant learning rate "
+                "instead!".format(warmup_steps))
+            scheduled_lr = fluid.layers.create_global_var(
+                name=fluid.unique_name.generate("learning_rate"),
+                shape=[1],
+                value=learning_rate,
+                dtype='float32',
+                persistable=True)
     else:
         raise ValueError("Unkown learning rate scheduler, should be "
                          "'noam_decay' or 'linear_warmup_decay'")
......
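The new branch falls back to a constant learning rate whenever `warmup_steps` is not positive, and otherwise uses the warmup schedule. As a rough illustration of what a linear-warmup-then-linear-decay schedule computes per step (a plain-Python sketch of a hypothetical helper, not the graph-mode implementation behind `linear_warmup_decay`):

```python
def linear_warmup_decay_value(step, learning_rate, warmup_steps, num_train_steps):
    """Ramp up linearly during warmup, then decay linearly towards zero
    by num_train_steps (illustrative scalar version)."""
    if warmup_steps > 0 and step < warmup_steps:
        return learning_rate * float(step) / warmup_steps
    return learning_rate * max(0.0, 1.0 - float(step) / num_train_steps)
```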
@@ -17,6 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import six
+import sys
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf8')
 import os
 import time
 import argparse
@@ -82,7 +88,7 @@ def main(args):
     predict_startup = fluid.Program()
     with fluid.program_guard(predict_prog, predict_startup):
         with fluid.unique_name.guard():
-            predict_pyreader, probs, feed_target_names = create_model(
+            predict_data_loader, probs, feed_target_names = create_model(
                 args,
                 bert_config=bert_config,
                 num_labels=num_labels,
@@ -112,11 +118,11 @@ def main(args):
     predict_exe = fluid.ParallelExecutor(
         use_cuda=args.use_cuda, main_program=predict_prog)
-    predict_pyreader.decorate_batch_generator(
+    predict_data_loader.set_batch_generator(
         processor.data_generator(
             batch_size=args.batch_size, phase='test', epoch=1, shuffle=False))
-    predict_pyreader.start()
+    predict_data_loader.start()
     all_results = []
     time_begin = time.time()
     while True:
@@ -124,7 +130,7 @@ def main(args):
             results = predict_exe.run(fetch_list=[probs.name])
             all_results.extend(results[0])
         except fluid.core.EOFException:
-            predict_pyreader.reset()
+            predict_data_loader.reset()
             break
     time_end = time.time()
......
@@ -307,32 +307,33 @@ def convert_examples_to_features(
                 end_position = 0
             if example_index < 3:
-                print("*** Example ***")
-                print("unique_id: %s" % (unique_id))
-                print("example_index: %s" % (example_index))
-                print("doc_span_index: %s" % (doc_span_index))
-                print("tokens: %s" % " ".join(
+                print(u"*** Example ***")
+                print(u"unique_id: %s" % (unique_id))
+                print(u"example_index: %s" % (example_index))
+                print(u"doc_span_index: %s" % (doc_span_index))
+                print(u"tokens: %s" % " ".join(
                     [tokenization.printable_text(x) for x in tokens]))
-                print("token_to_orig_map: %s" % " ".join([
+                print(u"token_to_orig_map: %s" % " ".join([
                     "%d:%d" % (x, y)
                     for (x, y) in six.iteritems(token_to_orig_map)
                 ]))
-                print("token_is_max_context: %s" % " ".join([
+                print(u"token_is_max_context: %s" % " ".join([
                     "%d:%s" % (x, y)
                     for (x, y) in six.iteritems(token_is_max_context)
                 ]))
-                print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                print("segment_ids: %s" % " ".join([str(x) for x in input_mask]))
+                print(u"input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                print(u"input_mask: %s" %
+                      " ".join([str(x) for x in input_mask]))
+                print(u"segment_ids: %s" %
+                      " ".join([str(x) for x in segment_ids]))
                 if is_training and example.is_impossible:
-                    print("impossible example")
+                    print(u"impossible example")
                 if is_training and not example.is_impossible:
                     answer_text = " ".join(tokens[start_position:(end_position +
                                                                   1)])
-                    print("start_position: %d" % (start_position))
-                    print("end_position: %d" % (end_position))
-                    print("answer: %s" %
+                    print(u"start_position: %d" % (start_position))
+                    print(u"end_position: %d" % (end_position))
+                    print(u"answer: %s" %
                           (tokenization.printable_text(answer_text)))
             feature = InputFeatures(
@@ -825,7 +826,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose):
     start_position = tok_text.find(pred_text)
     if start_position == -1:
         if verbose:
-            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+            print(u"Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
         return orig_text
     end_position = start_position + len(pred_text) - 1
@@ -834,7 +835,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose):
     if len(orig_ns_text) != len(tok_ns_text):
         if verbose:
-            print("Length not equal after stripping spaces: '%s' vs '%s'",
+            print(u"Length not equal after stripping spaces: '%s' vs '%s'",
                   orig_ns_text, tok_ns_text)
         return orig_text
@@ -852,7 +853,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose):
     if orig_start_position is None:
         if verbose:
-            print("Couldn't map start position")
+            print(u"Couldn't map start position")
         return orig_text
     orig_end_position = None
@@ -863,7 +864,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose):
     if orig_end_position is None:
         if verbose:
-            print("Couldn't map end position")
+            print(u"Couldn't map end position")
         return orig_text
     output_text = orig_text[orig_start_position:(orig_end_position + 1)]
......
@@ -17,9 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import six
 import sys
-reload(sys)
-sys.setdefaultencoding('utf8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf8')
 import os
 import time
@@ -107,8 +109,8 @@ args = parser.parse_args()
 # yapf: enable.
-def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase):
-    test_pyreader.start()
+def evaluate(exe, test_program, test_data_loader, fetch_list, eval_phase):
+    test_data_loader.start()
     total_cost, total_acc, total_num_seqs = [], [], []
     time_begin = time.time()
     while True:
@@ -119,7 +121,7 @@ def evaluate(exe, test_program, test_data_loader, fetch_list, eval_phase):
             total_acc.extend(np_acc * np_num_seqs)
             total_num_seqs.extend(np_num_seqs)
         except fluid.core.EOFException:
-            test_pyreader.reset()
+            test_data_loader.reset()
             break
     time_end = time.time()
     print("[%s evaluation] ave loss: %f, ave acc: %f, elapsed time: %f s" %
@@ -203,7 +205,7 @@ def main(args):
     with fluid.program_guard(train_program, startup_prog):
         with fluid.unique_name.guard():
-            train_pyreader, loss, probs, accuracy, num_seqs = create_model(
+            train_data_loader, loss, probs, accuracy, num_seqs = create_model(
                 args,
                 bert_config=bert_config,
                 num_labels=num_labels)
@@ -224,28 +226,17 @@ def main(args):
                 incr_ratio=args.incr_ratio,
                 decr_ratio=args.decr_ratio)
-    if args.verbose:
-        if args.in_tokens:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                program=train_program,
-                batch_size=args.batch_size // args.max_seq_len)
-        else:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                program=train_program, batch_size=args.batch_size)
-        print("Theoretical memory usage in training: %.3f - %.3f %s" %
-              (lower_mem, upper_mem, unit))
     if args.do_val:
         dev_prog = fluid.Program()
         with fluid.program_guard(dev_prog, startup_prog):
             with fluid.unique_name.guard():
-                dev_pyreader, loss, probs, accuracy, num_seqs = create_model(
+                dev_data_loader, loss, probs, accuracy, num_seqs = create_model(
                     args,
                     bert_config=bert_config,
                     num_labels=num_labels)
         dev_prog = dev_prog.clone(for_test=True)
-        dev_pyreader.decorate_batch_generator(
+        dev_data_loader.set_batch_generator(
             processor.data_generator(
                 batch_size=args.batch_size,
                 phase='dev',
@@ -257,13 +248,13 @@ def main(args):
         test_prog = fluid.Program()
         with fluid.program_guard(test_prog, startup_prog):
             with fluid.unique_name.guard():
-                test_pyreader, loss, probs, accuracy, num_seqs = create_model(
+                test_data_loader, loss, probs, accuracy, num_seqs = create_model(
                     args,
                     bert_config=bert_config,
                     num_labels=num_labels)
         test_prog = test_prog.clone(for_test=True)
-        test_pyreader.decorate_batch_generator(
+        test_data_loader.set_batch_generator(
             processor.data_generator(
                 batch_size=args.batch_size,
                 phase='test',
@@ -316,11 +307,11 @@ def main(args):
     train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
         loss_name=loss.name, build_strategy=build_strategy)
-    train_pyreader.decorate_batch_generator(train_data_generator, place)
+    train_data_loader.set_batch_generator(train_data_generator, place)
     if args.do_train:
-        train_pyreader.start()
+        train_data_loader.start()
         steps = 0
         total_cost, total_acc, total_num_seqs = [], [], []
         time_begin = time.time()
@@ -350,7 +341,7 @@ def main(args):
                 total_num_seqs.extend(np_num_seqs)
                 if args.verbose:
-                    verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
+                    verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                     )
                     verbose += "learning rate: %f" % np_lr[0]
                     if args.use_fp16:
@@ -386,18 +377,18 @@ def main(args):
                     throughput = []
                 # evaluate dev set
                 if args.do_val:
-                    evaluate(exe, dev_prog, dev_pyreader,
+                    evaluate(exe, dev_prog, dev_data_loader,
                              [loss.name, accuracy.name, num_seqs.name],
                              "dev")
                 # evaluate test set
                 if args.do_test:
-                    evaluate(exe, test_prog, test_pyreader,
+                    evaluate(exe, test_prog, test_data_loader,
                              [loss.name, accuracy.name, num_seqs.name],
                              "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
-               train_pyreader.reset()
+               train_data_loader.reset()
                break
     if args.enable_ce:
         card_num = get_cards()
@@ -421,13 +412,13 @@ def main(args):
     # final eval on dev set
     if args.do_val:
         print("Final validation result:")
-        evaluate(exe, dev_prog, dev_pyreader,
+        evaluate(exe, dev_prog, dev_data_loader,
                  [loss.name, accuracy.name, num_seqs.name], "dev")
     # final eval on test set
     if args.do_test:
         print("Final test result:")
-        evaluate(exe, test_prog, test_pyreader,
+        evaluate(exe, test_prog, test_data_loader,
                  [loss.name, accuracy.name, num_seqs.name], "test")
......
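Across all of these files the PyReader-to-DataLoader switch keeps the same non-iterable consumption pattern that `evaluate()` and the training loop follow: start the loader, run the program until the bound generator is exhausted, then reset. A condensed sketch of that pattern (`run_until_eof` is a hypothetical helper, not code from this repo; `exe`, `program`, `data_loader` and `fetch_list` are assumed to be built as in the hunks above):

```python
import paddle.fluid as fluid

def run_until_eof(exe, program, data_loader, fetch_list):
    # start feeding, run batches until the generator is exhausted,
    # then reset the loader so it can be started again for the next pass
    data_loader.start()
    results = []
    while True:
        try:
            results.append(exe.run(program=program, fetch_list=fetch_list))
        except fluid.core.EOFException:
            data_loader.reset()
            break
    return results
```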
@@ -17,9 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import six
 import sys
-reload(sys)
-sys.setdefaultencoding('utf8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf8')
 import argparse
 import collections
@@ -108,9 +110,8 @@ def create_model(bert_config, is_training=False):
     if is_training:
         input_fields = {
             'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'start_positions', 'end_positions'],
-            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                       [-1, args.max_seq_len, 1],
-                       [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
+            'shapes': [[None, None], [None, None], [None, None],
+                       [None, args.max_seq_len, 1], [None, 1], [None, 1]],
             'dtypes': [
                 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
             'lod_levels': [0, 0, 0, 0, 0, 0],
@@ -118,20 +119,19 @@ def create_model(bert_config, is_training=False):
     else:
         input_fields = {
             'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'unique_id'],
-            'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                       [-1, args.max_seq_len, 1],
-                       [-1, args.max_seq_len, 1], [-1, 1]],
+            'shapes': [[None, None], [None, None], [None, None],
+                       [None, args.max_seq_len, 1], [None, 1]],
             'dtypes': [
                 'int64', 'int64', 'int64', 'float32', 'int64'],
             'lod_levels': [0, 0, 0, 0, 0],
         }
-    inputs = [fluid.layers.data(name=input_fields['names'][i],
+    inputs = [fluid.data(name=input_fields['names'][i],
                       shape=input_fields['shapes'][i],
                       dtype=input_fields['dtypes'][i],
                       lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names']))]
-    pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False)
+    data_loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=50, iterable=False)
     if is_training:
         (src_ids, pos_ids, sent_ids, input_mask, start_positions, end_positions) = inputs
@@ -176,23 +176,23 @@ def create_model(bert_config, is_training=False):
         start_loss = compute_loss(start_logits, start_positions)
         end_loss = compute_loss(end_logits, end_positions)
         total_loss = (start_loss + end_loss) / 2.0
-        return pyreader, total_loss, num_seqs
+        return data_loader, total_loss, num_seqs
     else:
-        return pyreader, unique_id, start_logits, end_logits, num_seqs
+        return data_loader, unique_id, start_logits, end_logits, num_seqs
 RawResult = collections.namedtuple("RawResult",
                                    ["unique_id", "start_logits", "end_logits"])
-def predict(test_exe, test_program, test_pyreader, fetch_list, processor):
+def predict(test_exe, test_program, test_data_loader, fetch_list, processor):
     if not os.path.exists(args.checkpoints):
         os.makedirs(args.checkpoints)
     output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
     output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
     output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
-    test_pyreader.start()
+    test_data_loader.start()
     all_results = []
     time_begin = time.time()
     while True:
@@ -211,7 +211,7 @@ def predict(test_exe, test_program, test_data_loader, fetch_list, processor):
                     start_logits=start_logits,
                     end_logits=end_logits))
         except fluid.core.EOFException:
-            test_pyreader.reset()
+            test_data_loader.reset()
             break
     time_end = time.time()
@@ -279,7 +279,7 @@ def train(args):
     train_program = fluid.Program()
     with fluid.program_guard(train_program, startup_prog):
         with fluid.unique_name.guard():
-            train_pyreader, loss, num_seqs = create_model(
+            train_data_loader, loss, num_seqs = create_model(
                 bert_config=bert_config,
                 is_training=True)
@@ -300,22 +300,11 @@ def train(args):
                 incr_ratio=args.incr_ratio,
                 decr_ratio=args.decr_ratio)
-    if args.verbose:
-        if args.in_tokens:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                program=train_program,
-                batch_size=args.batch_size // args.max_seq_len)
-        else:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                program=train_program, batch_size=args.batch_size)
-        print("Theoretical memory usage in training: %.3f - %.3f %s" %
-              (lower_mem, upper_mem, unit))
     if args.do_predict:
         test_prog = fluid.Program()
         with fluid.program_guard(test_prog, startup_prog):
             with fluid.unique_name.guard():
-                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
+                test_data_loader, unique_ids, start_logits, end_logits, num_seqs = create_model(
                     bert_config=bert_config,
                     is_training=False)
@@ -359,9 +348,9 @@ def train(args):
     train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
         loss_name=loss.name, exec_strategy=exec_strategy)
-    train_pyreader.decorate_batch_generator(train_data_generator, place)
+    train_data_loader.set_batch_generator(train_data_generator, place)
-    train_pyreader.start()
+    train_data_loader.start()
     steps = 0
     total_cost, total_num_seqs = [], []
     time_begin = time.time()
@@ -387,7 +376,7 @@ def train(args):
             total_num_seqs.extend(np_num_seqs)
             if args.verbose:
-                verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
+                verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size(
                 )
                 verbose += "learning rate: %f " % np_lr[0]
                 if args.use_fp16:
@@ -414,11 +403,11 @@ def train(args):
             save_path = os.path.join(args.checkpoints,
                                      "step_" + str(steps) + "_final")
             fluid.io.save_persistables(exe, save_path, train_program)
-            train_pyreader.reset()
+            train_data_loader.reset()
             break
     if args.do_predict:
-        test_pyreader.decorate_batch_generator(
+        test_data_loader.set_batch_generator(
             processor.data_generator(
                 data_path=args.predict_file,
                 batch_size=args.batch_size,
@@ -427,7 +416,7 @@ def train(args):
                 dev_count=1,
                 epoch=1), place)
-        predict(exe, test_prog, test_pyreader, [
+        predict(exe, test_prog, test_data_loader, [
             unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
         ], processor)
......
@@ -17,13 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import six
 import sys
-reload(sys)
-sys.setdefaultencoding('utf8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf8')
 import os
 import time
-import sys
 import argparse
 import numpy as np
 import multiprocessing
@@ -98,21 +99,20 @@ args = parser.parse_args()
 def create_model(bert_config):
     input_fields = {
         'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'mask_label', 'mask_pos', 'labels'],
-        'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                   [-1, args.max_seq_len, 1],
-                   [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1]],
+        'shapes': [[None, None], [None, None], [None, None],
+                   [None, None, 1], [None, 1], [None, 1], [None, 1]],
         'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'],
         'lod_levels': [0, 0, 0, 0, 0, 0, 0],
     }
-    inputs = [fluid.layers.data(name=input_fields['names'][i],
+    inputs = [fluid.data(name=input_fields['names'][i],
                       shape=input_fields['shapes'][i],
                       dtype=input_fields['dtypes'][i],
                       lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names']))]
     (src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels) = inputs
-    pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False)
+    data_loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=50, iterable=False)
     bert = BertModel(
         src_ids=src_ids,
@@ -126,14 +126,14 @@ def create_model(bert_config):
     next_sent_acc, mask_lm_loss, total_loss = bert.get_pretraining_output(
         mask_label, mask_pos, labels)
-    return pyreader, next_sent_acc, mask_lm_loss, total_loss
+    return data_loader, next_sent_acc, mask_lm_loss, total_loss
 def predict_wrapper(args,
                     exe,
                     bert_config,
                     test_prog=None,
-                    pyreader=None,
+                    data_loader=None,
                     fetch_list=None):
     # Context to do validation.
     data_path = args.test_set_dir if args.do_test else args.validation_set_dir
@@ -148,7 +148,7 @@ def predict_wrapper(args,
         max_seq_len=args.max_seq_len,
         is_test=True)
-    pyreader.decorate_batch_generator(data_reader.data_generator())
+    data_loader.set_batch_generator(data_reader.data_generator())
     if args.do_test:
         assert args.init_checkpoint is not None, "[FATAL] Please use --init_checkpoint '/path/to/checkpoints' \
@@ -156,8 +156,8 @@ def predict_wrapper(args,
         init_pretraining_params(exe, args.init_checkpoint, test_prog)
-    def predict(exe=exe, pyreader=pyreader):
-        pyreader.start()
+    def predict(exe=exe, data_loader=data_loader):
+        data_loader.start()
         cost = 0
         lm_cost = 0
@@ -176,7 +176,7 @@ def predict_wrapper(args,
                     print("[test_set] steps: %d" % steps)
             except fluid.core.EOFException:
-                pyreader.reset()
+                data_loader.reset()
                 break
         used_time = time.time() - time_begin
@@ -193,7 +193,7 @@ def test(args):
     test_startup = fluid.Program()
     with fluid.program_guard(test_prog, test_startup):
         with fluid.unique_name.guard():
-            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                 bert_config=bert_config)
     test_prog = test_prog.clone(for_test=True)
@@ -207,7 +207,7 @@ def test(args):
         exe,
         bert_config,
         test_prog=test_prog,
-        pyreader=test_pyreader,
+        data_loader=test_data_loader,
         fetch_list=[next_sent_acc.name, mask_lm_loss.name, total_loss.name])
     print("test begin")
@@ -228,7 +228,7 @@ def train(args):
     startup_prog = fluid.Program()
     with fluid.program_guard(train_program, startup_prog):
         with fluid.unique_name.guard():
-            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+            train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                 bert_config=bert_config)
             scheduled_lr, loss_scaling = optimization(
                 loss=total_loss,
@@ -250,7 +250,7 @@ def train(args):
     test_prog = fluid.Program()
     with fluid.program_guard(test_prog, startup_prog):
         with fluid.unique_name.guard():
-            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
+            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                 bert_config=bert_config)
     test_prog = test_prog.clone(for_test=True)
@@ -263,16 +263,6 @@ def train(args):
     dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
     print("Device count %d" % dev_count)
-    if args.verbose:
-        if args.in_tokens:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                program=train_program,
-                batch_size=args.batch_size // args.max_seq_len)
-        else:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                program=train_program, batch_size=args.batch_size)
-        print("Theoretical memory usage in training: %.3f - %.3f %s" %
-              (lower_mem, upper_mem, unit))
     nccl2_num_trainers = 1
     nccl2_trainer_id = 0
@@ -345,13 +335,13 @@ def train(args):
         exe,
         bert_config,
         test_prog=test_prog,
-        pyreader=test_pyreader,
+        data_loader=test_data_loader,
         fetch_list=[
             next_sent_acc.name, mask_lm_loss.name, total_loss.name
         ])
-    train_pyreader.decorate_batch_generator(data_reader.data_generator())
-    train_pyreader.start()
+    train_data_loader.set_batch_generator(data_reader.data_generator())
+    train_data_loader.start()
     steps = 0
     cost = []
     lm_cost = []
@@ -402,7 +392,7 @@ def train(args):
             epoch, current_file_index, total_file, current_file = data_reader.get_progress(
             )
             if args.verbose:
-                verbose = "feed_queue size: %d, " %train_pyreader.queue.size()
+                verbose = "feed_queue size: %d, " %train_data_loader.queue.size()
                 verbose += "current learning_rate: %f, " % np_lr[0]
                 if args.use_fp16:
                     verbose += "loss scaling: %f" % np_scaling[0]
@@ -437,7 +427,7 @@ def train(args):
                 np.mean(np.array(vali_acc) / vali_steps), vali_speed))
         except fluid.core.EOFException:
-            train_pyreader.reset()
+            train_data_loader.reset()
             break
 if __name__ == '__main__':
......
@@ -119,9 +119,10 @@ def create_master_params_grads(params_grads, main_prog, startup_prog,
 def master_param_to_train_param(master_params_grads, params_grads, main_prog):
     for idx, m_p_g in enumerate(master_params_grads):
         train_p, _ = params_grads[idx]
-        if train_p.name.find("layer_norm") > -1:
-            continue
         with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
-            append_cast_op(m_p_g[0], train_p, main_prog)
+            if train_p.name.find("layer_norm") > -1:
+                fluid.layers.assign(m_p_g[0], train_p)
+            else:
+                append_cast_op(m_p_g[0], train_p, main_prog)
......
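For readability, this is roughly how `master_param_to_train_param` reads after the change (reconstructed from the hunk above; it assumes Paddle Fluid 1.6 and the `append_cast_op` helper defined earlier in the same file). Layer-norm parameters, which are kept in FP32 during mixed-precision training, now get the master value copied back with `assign` instead of being skipped, while every other parameter is cast back to its FP16 training copy:

```python
import paddle.fluid as fluid

def master_param_to_train_param(master_params_grads, params_grads, main_prog):
    for idx, m_p_g in enumerate(master_params_grads):
        train_p, _ = params_grads[idx]
        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
            if train_p.name.find("layer_norm") > -1:
                # layer_norm weights stay FP32: copy the master value directly
                fluid.layers.assign(m_p_g[0], train_p)
            else:
                # cast the FP32 master parameter back to the FP16 training copy
                # (append_cast_op is defined earlier in this file)
                append_cast_op(m_p_g[0], train_p, main_prog)
```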