Commit fc72dc60 authored by Zeyu Chen

migrate bert to latest interface

Parent cf4d67dd
......@@ -83,7 +83,10 @@ if __name__ == '__main__':
batch_size=args.batch_size,
max_seq_len=args.max_seq_len,
weight_decay=args.weight_decay,
in_tokens=args.in_tokens,
finetune_strategy="bert_finetune",
with_memory_optimization=True,
in_tokens=True,
optimizer=None,
warmup_proportion=args.warmup_proportion)
processor = reader.ChnsenticorpProcessor(
......@@ -123,4 +126,8 @@ if __name__ == '__main__':
# Finetune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
hub.finetune_and_eval(cls_task, feed_list, processor, config)
hub.finetune_and_eval(
task=cls_task,
data_processor=processor,
feed_list=feed_list,
config=config)
export CUDA_VISIBLE_DEVICES=6
export CUDA_VISIBLE_DEVICES=2
BERT_BASE_PATH="chinese_L-12_H-768_A-12"
TASK_NAME='chnsenticorp'
......
......@@ -30,6 +30,8 @@ FinetuneConfig = collections.namedtuple(
'weight_decay', # for bert
'warmup_proportion', # for bert
'in_tokens', # for bert
'strategy',
'with_memory_optimization'
'finetune_strategy',
'with_memory_optimization',
# learning rate scheduler
'optimizer'
])
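For orientation, here is a minimal sketch of the renamed configuration fields as a caller would fill them (illustrative only, not part of the diff; every value is an assumption rather than a repository default, and the remaining namedtuple fields are elided):
# Illustrative assumptions, not repository defaults; other fields elided.
config_fields = dict(
    weight_decay=0.01,                   # for bert
    warmup_proportion=0.1,               # for bert
    in_tokens=True,                      # for bert
    finetune_strategy="bert_finetune",   # renamed from the old 'strategy' field
    with_memory_optimization=True,
    optimizer=None,                      # consulted only when finetune_strategy is not "bert_finetune"
)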
......@@ -23,17 +23,7 @@ import paddle
import paddle.fluid as fluid
from paddle_hub.tools.logger import logger
def optimizer_config_for_strategy(strategy, parameters, data_processor,
dev_count):
# basic configuration
learning_rate = 1e-4
optimizer = fluid.optimizer.Adam(learning_rate)
regularizer = fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)
return optimizer
from paddle_hub.finetune.optimization import bert_finetune
def _finetune_model(task,
......@@ -51,12 +41,10 @@ def _finetune_model(task,
learning_rate = config.learning_rate
use_cuda = config.use_cuda
batch_size = config.batch_size
strategy = config.strategy
with_memory_optimization = config.with_memory_optimization
checkpoint_dir = config.checkpoint_dir
with fluid.program_guard(main_program, startup_program):
if use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
......@@ -65,17 +53,20 @@ def _finetune_model(task,
dev_count = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
optimizer = optimizer_config_for_strategy(
strategy=strategy,
parameters=None,
data_processor=data_processor,
dev_count=dev_count)
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe = fluid.Executor(place=place)
optimizer.minimize(loss)
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
if config.finetune_strategy == "bert_finetune":
scheduled_lr = bert_finetune(task, main_program, data_processor,
config, dev_count)
elif config.optimizer == "adam":
optimizer = fluid.optimizer.Adam(learning_rate=config.learning_rate)
optimizer.minimize(loss)
# TODO: add more finetune strategies
if with_memory_optimization:
logger.info("Memory optimize start")
logger.info("Memory optimization start...")
optimize_time_begin = time.time()
fluid.memory_optimize(
input_program=fluid.default_main_program(),
skip_opt_set=[
......@@ -83,7 +74,9 @@ def _finetune_model(task,
loss.name,
accuracy.name
])
logger.info("Memory optimize end")
time_used = time.time() - optimize_time_begin
logger.info(
"Memory optimization done! Time elapsed %f sec" % time_used)
# initialize all parameters
exe.run(fluid.default_startup_program())
......@@ -91,13 +84,12 @@ def _finetune_model(task,
logger.info("Finetune start")
train_time_begin = time.time()
for index in range(epoch):
train_reader = paddle.batch(
data_processor.data_generator(phase='train'),
batch_size=batch_size)
train_reader = data_processor.data_generator(
batch_size=batch_size, phase='train')
size = accuracy_sum = loss_sum = 0
for batch in train_reader():
loss_v, accuracy_v = exe.run(
feed=data_feeder.feed(batch),
feed=data_feeder.feed([batch]),
fetch_list=[loss.name, accuracy.name])
step += 1
size += len(batch)
......@@ -106,16 +98,16 @@ def _finetune_model(task,
if step % config.log_interval == 0:
train_time_used = time.time() - train_time_begin
perf = train_time_used / config.log_interval
speed = config.log_interval / train_time_used
train_time_begin = time.time()
logger.info(
"step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
(step, loss_sum / size, accuracy_sum / size, perf))
(step, loss_sum / size, accuracy_sum / size, speed))
size = accuracy_sum = loss_sum = 0
if step % config.save_ckpt_interval == 0:
model_save_dir = os.path.join(
checkpoint_dir, "model_parameters_in_step%d" % step)
model_save_dir = os.path.join(checkpoint_dir,
"step_%d" % step)
fluid.io.save_persistables(exe, dirname=model_save_dir)
if eval_model and step % config.eval_interval == 0:
......@@ -123,7 +115,7 @@ def _finetune_model(task,
# eval before end
if eval_model:
eval(task, data_processor, feed_list, config)
logger.info("Finetune end")
logger.info("Finetune finished")
def save_model_and_checkpoint(task, save_dir):
......@@ -150,22 +142,22 @@ def eval(task, data_processor, feed_list, config=None):
accuracy = task.variable("accuracy")
use_cuda = config.use_cuda
batch_size = config.batch_size
logger.info("[Evaluation] start")
with fluid.program_guard(inference_program):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe = fluid.Executor(place=place)
size = accuracy_sum = loss_sum = 0
test_reader = paddle.batch(
data_processor.data_generator(phase='test'), batch_size=batch_size)
test_reader = data_processor.data_generator(
batch_size=batch_size, phase='test')
eval_time_begin = time.time()
for index, batch in enumerate(test_reader()):
loss_v, accuracy_v, = exe.run(
feed=data_feeder.feed(batch), fetch_list=[loss, accuracy.name])
feed=data_feeder.feed([batch]),
fetch_list=[loss, accuracy.name])
size += len(batch)
accuracy_sum += accuracy_v * len(batch)
loss_sum += loss_v * len(batch)
eval_time_used = time.time() - eval_time_begin
perf = eval_time_used / index
eval_speed = index / eval_time_used
logger.info("[Evaluation] loss=%.5f acc=%.5f [step/sec: %.2f]" %
(loss_sum / size, accuracy_sum / size, perf))
(loss_sum / size, accuracy_sum / size, eval_speed))
......@@ -19,7 +19,6 @@ import time
import numpy as np
import multiprocessing
from paddle_hub.finetune.optimization import bert_optimization
from .task import Task
__all__ = ['append_mlp_classifier']
......
......@@ -19,43 +19,90 @@ from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
"""
Finetune optimization strategy
"""
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
""" Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="scheduled_learning_rate")
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
warmup_lr = learning_rate * (global_step / warmup_steps)
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
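The schedule built above ramps the rate linearly from 0 to learning_rate over warmup_steps, then polynomial_decay with power=1.0 takes it linearly down to 0 at num_train_steps. A plain-Python sketch of the same arithmetic (illustrative only, not part of the diff):
# Mirrors the Switch/polynomial_decay graph above using ordinary floats.
def scheduled_lr_value(step, learning_rate, warmup_steps, num_train_steps):
    if step < warmup_steps:
        # linear warmup from 0 to learning_rate
        return learning_rate * step / warmup_steps
    # polynomial decay with power=1.0: linear decay to 0.0 at num_train_steps
    progress = min(step, num_train_steps) / num_train_steps
    return learning_rate * (1.0 - progress)

# e.g. learning_rate=5e-5, warmup_steps=100, num_train_steps=1000:
#   step 50   -> 2.5e-5  (halfway through warmup)
#   step 500  -> 2.5e-5  (halfway through the overall schedule)
#   step 1000 -> 0.0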
def bert_optimization(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
weight_decay,
scheduler='linear_warmup_decay'):
def bert_finetune(task, train_program, data_processor, config, dev_count):
# calculate warmup steps
num_train_examples = data_processor.get_num_examples(phase='train')
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
warmup_steps = int(max_train_steps * config.warmup_proportion)
loss = task.variable("loss")
scheduled_lr = adam_weight_decay_optimizer_with_linear_warmup(
loss, warmup_steps, max_train_steps, config.learning_rate,
train_program, config.weight_decay)
return scheduled_lr
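A worked example of the warmup-step arithmetic above; all numbers are assumptions chosen only to make the calculation concrete, not project defaults:
# Hypothetical values, for illustration only.
num_train_examples = 9600                 # assumed training-set size
num_epoch, batch_size, dev_count = 3, 32, 1
warmup_proportion = 0.1
max_train_steps = num_epoch * num_train_examples // batch_size // dev_count   # = 900
warmup_steps = int(max_train_steps * warmup_proportion)                       # = 90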
def adam_weight_decay_optimizer_with_noam_decay(
loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
weight_decay,
scheduler='noam_decay'):
if warmup_steps > 0:
if scheduler == 'noam_decay':
scheduled_lr = fluid.layers.learning_rate_scheduler\
.noam_decay(1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
elif scheduler == 'linear_warmup_decay':
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
num_train_steps)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
else:
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
scheduled_lr = learning_rate
clip_norm_thres = 1.0
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
def exclude_from_weight_decay(name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
for param in train_program.global_block().all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
if exclude_from_weight_decay(param.name):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr
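Conceptually, the loop above applies decoupled weight decay after Adam's update: each parameter that is not a layer-norm parameter or a bias is additionally shrunk by weight_decay * scheduled_lr times its pre-update snapshot (the `param * 1.0` copies). A minimal sketch with scalars standing in for parameter tensors (illustrative only, not part of the diff):
# `snapshot` holds values taken before optimizer.minimize(); `updated` holds post-Adam values.
def apply_weight_decay(updated, snapshot, weight_decay, scheduled_lr):
    for name, value in updated.items():
        if "layer_norm" in name or name.endswith(("_bias", "_b", ".b_0")):
            continue  # excluded from weight decay, as in exclude_from_weight_decay
        updated[name] = value - snapshot[name] * weight_decay * scheduled_lr
    return updated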
def adam_weight_decay_optimizer_with_linear_warmup(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
weight_decay,
scheduler='linear_warmup_decay'):
if warmup_steps > 0:
if scheduler == 'noam_decay':
scheduled_lr = fluid.layers.learning_rate_scheduler\
......
......@@ -19,9 +19,6 @@ import time
import numpy as np
import multiprocessing
from paddle_hub.finetune.optimization import bert_optimization
from paddle_hub.finetune.config import FinetuneConfig
class Task(object):
def __init__(self, task_type, graph_var_dict, main_program,
......