Commit 377f2496 authored by Zeyu Chen

remove in_tokens arguments of berts

Parent 57dbe135
......@@ -70,8 +70,21 @@ run_type_g.add_arg("use_cuda", bool, True, "If set, use G
args = parser.parse_args()
# yapf: enable.
def test_hub_api(args, config):
if __name__ == '__main__':
print_arguments(args)
config = FinetuneConfig(
log_interval=10,
eval_interval=100,
save_ckpt_interval=200,
use_cuda=True,
checkpoint_dir="./bert_cls_ckpt",
learning_rate=args.learning_rate,
num_epoch=args.epoch,
batch_size=args.batch_size,
max_seq_len=args.max_seq_len,
weight_decay=args.weight_decay,
in_tokens=args.in_tokens,
warmup_proportion=args.warmup_proportion)
processor = reader.ChnsenticorpProcessor(
data_dir=args.data_dir,
......@@ -86,38 +99,28 @@ def test_hub_api(args, config):
    # load the PaddleHub BERT module
module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")
    # BERT's input tensors, output tensors and forward graph
    # If you want to fine-tune the pretrained model parameters, please set
    # trainable to True
input_dict, output_dict, train_program = module.context(
sign_name="pooled_output", trainable=True)
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
with fluid.program_guard(train_program):
label = fluid.layers.data(name="label", shape=[1], dtype='int64')
pooled_output = output_dict["pooled_output"]
# setup feed list for data feeder
# Setup feed list for data feeder
        # Must feed all the tensors that bert's module needs
feed_list = [
input_dict["src_ids"].name, input_dict["pos_ids"].name,
input_dict["sent_ids"].name, input_dict["input_mask"].name,
label.name
]
        # Define a classification finetune task by PaddleHub's API
cls_task = hub.append_mlp_classifier(
pooled_output, label, num_classes=num_labels)
        # Finetune and evaluate by PaddleHub's API,
        # which finishes training, evaluation, testing and model saving automatically
hub.finetune_and_eval(cls_task, feed_list, processor, config)
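        # Note on the feed order (descriptive comment, not part of this commit): feed_list
        # must list src_ids, pos_ids, sent_ids and input_mask in the order the BERT module
        # expects, followed by the task label, so the DataFeeder binds each batch column
        # to the right placeholder.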
if __name__ == '__main__':
print_arguments(args)
config = FinetuneConfig(
stat_interval=args.skip_steps,
eval_interval=args.validation_steps,
use_cuda=True,
learning_rate=args.learning_rate,
weight_decay=args.weight_decay,
in_tokens=args.in_tokens,
epoch=args.epoch,
batch_size=args.batch_size,
max_seq_len=args.max_seq_len,
warmup_proportion=args.warmup_proportion)
test_hub_api(args, config)
......@@ -7,8 +7,8 @@ DATA_PATH=chnsenticorp_data
rm -rf $CKPT_PATH
python -u finetune_with_hub.py \
--use_cuda true \
--batch_size 4096 \
--in_tokens true \
--batch_size 32 \
--in_tokens false \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--weight_decay 0.01 \
......
......@@ -14,8 +14,20 @@
import collections
FinetuneConfig = collections.namedtuple('FinetuneConfig', [
'stat_interval', 'eval_interval', 'use_cuda', 'learning_rate',
'weight_decay', 'in_tokens', 'epoch', 'batch_size', 'max_seq_len',
'warmup_proportion'
])
FinetuneConfig = collections.namedtuple(
'FinetuneConfig',
[
        'log_interval',  # print the training log every n steps
        'eval_interval',  # evaluate the model every n steps
'save_ckpt_interval', # save the model checkpoint every n steps
'use_cuda', # use gpu or not
'learning_rate',
'checkpoint_dir', # model checkpoint directory
'num_epoch', # number of finetune epoch
'batch_size',
        # bert-specific parameters
'max_seq_len', # for bert
'weight_decay', # for bert
'warmup_proportion', # for bert
'in_tokens' # for bert
])
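# A minimal construction sketch (illustrative only; learning_rate, num_epoch and the other
# numeric values below are assumptions, not taken from this commit):
#
#     config = FinetuneConfig(
#         log_interval=10, eval_interval=100, save_ckpt_interval=200,
#         use_cuda=True, learning_rate=5e-5, checkpoint_dir="./bert_cls_ckpt",
#         num_epoch=3, batch_size=32, max_seq_len=128,
#         weight_decay=0.01, warmup_proportion=0.1, in_tokens=False)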
......@@ -30,22 +30,7 @@ def finetune_and_eval(task, feed_list, data_processor, config=None):
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
# data generator
data_generator = {
'train':
data_processor.data_generator(
batch_size=config.batch_size,
phase='train',
epoch=config.epoch,
shuffle=False),
'test':
data_processor.data_generator(
batch_size=config.batch_size, phase='test', shuffle=False),
'dev':
data_processor.data_generator(
batch_size=config.batch_size, phase='dev', shuffle=False)
}
exe = fluid.Executor(place)
# hub.finetune_and_eval start here
#TODO: to simplify
......@@ -56,10 +41,10 @@ def finetune_and_eval(task, feed_list, data_processor, config=None):
num_train_examples = data_processor.get_num_examples(phase='train')
if config.in_tokens:
max_train_steps = config.epoch * num_train_examples // (
max_train_steps = config.num_epoch * num_train_examples // (
config.batch_size // config.max_seq_len) // dev_count
else:
max_train_steps = config.epoch * num_train_examples // config.batch_size // dev_count
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
warmup_steps = int(max_train_steps * config.warmup_proportion)
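    # A worked example of the step math above (illustrative numbers, not from this commit):
    # with num_train_examples = 9600, num_epoch = 3, batch_size = 32, dev_count = 1 and
    # in_tokens = False,
    #     max_train_steps = 3 * 9600 // 32 // 1 = 900
    # and with warmup_proportion = 0.1,
    #     warmup_steps = int(900 * 0.1) = 90
    # When in_tokens is True, batch_size is measured in tokens, so it is first divided by
    # max_seq_len to approximate the number of examples per batch.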
......@@ -83,73 +68,80 @@ def finetune_and_eval(task, feed_list, data_processor, config=None):
num_example.name
])
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_program)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
    # Training block
# prepare training dataset
train_data_generator = data_generator['train']
total_loss, total_acc, total_num_example = [], [], []
step = 0
time_begin = time.time()
train_time_used = 0.0
for example in train_data_generator():
step += 1
train_time_begin = time.time()
np_loss, np_acc, np_num_example = exe.run(
program=train_program,
feed=feeder.feed([example]),
fetch_list=[loss, accuracy, num_example])
train_time_used += time.time() - train_time_begin
for epoch in range(1, config.num_epoch + 1):
print("Epoch {}".format(epoch))
train_data_generator = data_processor.data_generator(
batch_size=config.batch_size, phase='train', shuffle=False)
for example in train_data_generator():
step += 1
train_time_begin = time.time()
np_loss, np_acc, np_num_example = exe.run(
program=train_program,
feed=feeder.feed([example]),
fetch_list=[loss, accuracy, num_example])
train_time_used += time.time() - train_time_begin
            # Statistics block
total_loss.extend(np_loss * np_num_example)
total_acc.extend(np_acc * np_num_example)
total_num_example.extend(np_num_example)
if step % config.log_interval == 0:
# get training progress
accum_num_example = np.sum(total_num_example)
print(
"step {}: loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
step,
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example,
config.log_interval / train_time_used))
# reset statistic variables
total_loss, total_acc, total_num_example = [], [], []
train_time_used = 0.0
# Evaluation block
if step % config.eval_interval == 0:
test_data_generator = data_processor.data_generator(
batch_size=config.batch_size, phase='test', shuffle=False)
dev_data_generator = data_processor.data_generator(
batch_size=config.batch_size, phase='dev', shuffle=False)
evaluate(task, test_program, exe, feeder, dev_data_generator)
evaluate(task, test_program, exe, feeder, test_data_generator)
# Save model checkpoint
if step % config.save_ckpt_interval == 0:
save_checkpoint(exe, train_program, step, config.checkpoint_dir)
    # Final evaluation on the test set
test_data_generator = data_processor.data_generator(
batch_size=config.batch_size, phase='test', shuffle=False)
evaluate(task, test_program, exe, feeder, test_data_generator)
def save_checkpoint(exe, train_program, step, ckpt_dir):
    #TODO: add a global step variable for restoring checkpoints, like TensorFlow
ckpt_step_dir = os.path.join(ckpt_dir, "step_{}".format(step))
fluid.io.save_persistables(exe, ckpt_step_dir, train_program)
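# A matching restore helper could look like the sketch below (an assumption, not part of
# this commit; fluid.io.load_persistables is the counterpart of save_persistables):
#
#     def load_checkpoint(exe, train_program, step, ckpt_dir):
#         ckpt_step_dir = os.path.join(ckpt_dir, "step_{}".format(step))
#         fluid.io.load_persistables(exe, ckpt_step_dir, train_program)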
def evaluate(task, test_program, exe, feeder, data_generator):
loss = task.variable("loss")
probs = task.variable("probs")
accuracy = task.variable("accuracy")
num_example = task.variable("num_example")
# Statistic Block
total_loss.extend(np_loss * np_num_example)
total_acc.extend(np_acc * np_num_example)
total_num_example.extend(np_num_example)
if step % config.stat_interval == 0:
# get training progress
accum_num_example = np.sum(total_num_example)
print("step {}: loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
step,
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example,
config.stat_interval / train_time_used))
# reset statistic variables
total_loss, total_acc, total_num_example = [], [], []
train_time_used = 0.0
# Evaluation block
if step % config.eval_interval == 0:
evaluate(test_program, exe, data_generator)
if step % config.eval_interval == 0:
# Final Test Block
total_loss, total_acc, total_num_example = [], [], []
test_data_generator = data_generator['test']
for example in test_data_generator():
np_loss, np_acc, np_num_example = exe.run(
program=test_program,
feed=feeder.feed([example]),
fetch_list=[loss, accuracy, num_example])
total_loss.extend(np_loss * np_num_example)
total_acc.extend(np_acc * np_num_example)
total_num_example.extend(np_num_example)
accum_num_example = np.sum(total_num_example)
print("[Final Test] loss={:.5f} acc={:.5f}".format(
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example))
def evaluate(test_program, exe, feeder, data_generator):
print("Evaluation start")
total_loss, total_acc, total_num_example = [], [], []
dev_data_generator = data_generator['dev']
eval_step = 0
eval_time_begin = time.time()
for example in dev_data_generator():
for example in data_generator():
eval_step += 1
np_loss, np_acc, np_num_example = exe.run(
program=test_program,
......@@ -160,6 +152,6 @@ def evaluate(test_program, exe, feeder, data_generator):
total_num_example.extend(np_num_example)
eval_time_used = time.time() - eval_time_begin
accum_num_example = np.sum(total_num_example)
print("[Evaluation] loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
print("[evaluation] loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example, eval_step / eval_time_used))
......@@ -49,95 +49,6 @@ def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
return lr
def optimization(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False,
loss_scaling=1.0):
if warmup_steps > 0:
if scheduler == 'noam_decay':
scheduled_lr = fluid.layers.learning_rate_scheduler\
.noam_decay(1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
elif scheduler == 'linear_warmup_decay':
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
num_train_steps)
else:
            raise ValueError("Unknown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
else:
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
scheduled_lr = learning_rate
clip_norm_thres = 1.0
# When using mixed precision training, scale the gradient clip threshold
# by loss_scaling
if use_fp16 and loss_scaling > 1.0:
clip_norm_thres *= loss_scaling
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
def exclude_from_weight_decay(name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
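    # For illustration (parameter names below are hypothetical), the check above behaves as:
    #     exclude_from_weight_decay("post_att_layer_norm_scale")  -> True  ("layer_norm" substring)
    #     exclude_from_weight_decay("cls_out_b")                  -> True  ("_b" suffix)
    #     exclude_from_weight_decay("fc_0.b_0")                   -> True  (".b_0" suffix)
    #     exclude_from_weight_decay("word_embedding")             -> False (weight decay applied)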
param_list = dict()
if use_fp16:
param_grads = optimizer.backward(loss)
master_param_grads = create_master_params_grads(
param_grads, train_program, startup_prog, loss_scaling)
for param, _ in master_param_grads:
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
optimizer.apply_gradients(master_param_grads)
if weight_decay > 0:
for param, grad in master_param_grads:
if exclude_from_weight_decay(param.name.rstrip(".master")):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
master_param_to_train_param(master_param_grads, param_grads,
train_program)
else:
for param in train_program.global_block().all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
if exclude_from_weight_decay(param.name):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr
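# An illustrative call (placeholder values mirroring the signature above, not taken from
# this commit):
#
#     scheduled_lr = optimization(
#         loss=loss,
#         warmup_steps=warmup_steps,
#         num_train_steps=max_train_steps,
#         learning_rate=config.learning_rate,
#         train_program=train_program,
#         startup_prog=startup_program,
#         weight_decay=config.weight_decay,
#         scheduler='linear_warmup_decay')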
def bert_optimization(loss,
warmup_steps,
num_train_steps,
......