Commit fc72dc60 authored by Zeyu Chen

migrate bert to latest interface

Parent cf4d67dd
@@ -83,7 +83,10 @@ if __name__ == '__main__':
         batch_size=args.batch_size,
         max_seq_len=args.max_seq_len,
         weight_decay=args.weight_decay,
-        in_tokens=args.in_tokens,
+        finetune_strategy="bert_finetune",
+        with_memory_optimization=True,
+        in_tokens=True,
+        optimizer=None,
         warmup_proportion=args.warmup_proportion)
 
     processor = reader.ChnsenticorpProcessor(
@@ -123,4 +126,8 @@ if __name__ == '__main__':
     # Finetune and evaluate by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
-    hub.finetune_and_eval(cls_task, feed_list, processor, config)
+    hub.finetune_and_eval(
+        task=cls_task,
+        data_processor=processor,
+        feed_list=feed_list,
+        config=config)
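The old positional call passed `(cls_task, feed_list, processor, config)`, while the migrated call names every argument, so the argument order of the new interface no longer matters at the call site. A tiny self-contained illustration of that point; the stub below only mirrors the parameter names seen in the new call and is not the real `hub.finetune_and_eval`:

```python
# Hypothetical stub, mirroring only the keyword names used in the new call above;
# not the real paddle_hub implementation.
def finetune_and_eval(task, data_processor, feed_list, config):
    return task, data_processor, feed_list, config

# With keyword arguments, a reordered or extended signature cannot silently
# swap data_processor and feed_list the way a positional call could.
result = finetune_and_eval(
    task="cls_task",
    data_processor="processor",
    feed_list="feed_list",
    config="config")
print(result)
```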
-export CUDA_VISIBLE_DEVICES=6
+export CUDA_VISIBLE_DEVICES=2
 BERT_BASE_PATH="chinese_L-12_H-768_A-12"
 TASK_NAME='chnsenticorp'
...
@@ -30,6 +30,8 @@ FinetuneConfig = collections.namedtuple(
         'weight_decay',  # for bert
         'warmup_proportion',  # for bert
         'in_tokens',  # for bert
-        'strategy',
-        'with_memory_optimization'
+        'finetune_strategy',
+        'with_memory_optimization',
+        # learning rate scheduler
+        'optimizer'
     ])
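Because `FinetuneConfig` is a plain `collections.namedtuple`, every field must be supplied at construction time, which is why the demo above now passes `finetune_strategy`, `with_memory_optimization`, and `optimizer` explicitly. A reduced, runnable sketch; only the fields visible in this commit are included, and the sample values are hypothetical:

```python
import collections

# Reduced illustration: the real FinetuneConfig has more fields (collapsed in
# the diff above); only the ones touched by this commit are listed here.
FinetuneConfig = collections.namedtuple('FinetuneConfig', [
    'weight_decay',             # for bert
    'warmup_proportion',        # for bert
    'in_tokens',                # for bert
    'finetune_strategy',        # renamed from 'strategy'
    'with_memory_optimization',
    # learning rate scheduler
    'optimizer',
])

# namedtuple defines no field defaults (and Python < 3.7 has no `defaults=`
# argument), so omitting any field raises TypeError; values are hypothetical.
config = FinetuneConfig(
    weight_decay=0.01,
    warmup_proportion=0.1,
    in_tokens=True,
    finetune_strategy='bert_finetune',
    with_memory_optimization=True,
    optimizer=None)

print(config.finetune_strategy, config.optimizer)
```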
@@ -23,17 +23,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle_hub.tools.logger import logger
-
-
-def optimizer_config_for_strategy(strategy, parameters, data_processor,
-                                  dev_count):
-    # basic configuration
-    learning_rate = 1e-4
-    optimizer = fluid.optimizer.Adam(learning_rate)
-    regularizer = fluid.regularizer.L2DecayRegularizer(
-        regularization_coeff=1e-4)
-    return optimizer
+from paddle_hub.finetune.optimization import bert_finetune
 
 
 def _finetune_model(task,
@@ -51,12 +41,10 @@ def _finetune_model(task,
     learning_rate = config.learning_rate
     use_cuda = config.use_cuda
     batch_size = config.batch_size
-    strategy = config.strategy
     with_memory_optimization = config.with_memory_optimization
     checkpoint_dir = config.checkpoint_dir
 
     with fluid.program_guard(main_program, startup_program):
         if use_cuda:
             place = fluid.CUDAPlace(0)
             dev_count = fluid.core.get_cuda_device_count()
@@ -65,17 +53,20 @@ def _finetune_model(task,
             dev_count = int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
 
-        optimizer = optimizer_config_for_strategy(
-            strategy=strategy,
-            parameters=None,
-            data_processor=data_processor,
-            dev_count=dev_count)
-        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
         exe = fluid.Executor(place=place)
-        optimizer.minimize(loss)
+        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+
+        if config.finetune_strategy == "bert_finetune":
+            scheduled_lr = bert_finetune(task, main_program, data_processor,
+                                         config, dev_count)
+        elif config.optimizer == "adam":
+            optimizer = fluid.optimizer.Adam(learning_rate=config.learning_rate)
+            optimizer.minimize(loss)
+        # TODO: add more finetune strategies
 
         if with_memory_optimization:
-            logger.info("Memory optimize start")
+            logger.info("Memory optimization start...")
+            optimize_time_begin = time.time()
             fluid.memory_optimize(
                 input_program=fluid.default_main_program(),
                 skip_opt_set=[
@@ -83,7 +74,9 @@ def _finetune_model(task,
                     loss.name,
                     accuracy.name
                 ])
-            logger.info("Memory optimize end")
+            time_used = time.time() - optimize_time_begin
+            logger.info(
+                "Memory optimization done! Time elapsed %f sec" % time_used)
 
         # initialize all parameters
         exe.run(fluid.default_startup_program())
@@ -91,13 +84,12 @@ def _finetune_model(task,
         logger.info("Finetune start")
         train_time_begin = time.time()
         for index in range(epoch):
-            train_reader = paddle.batch(
-                data_processor.data_generator(phase='train'),
-                batch_size=batch_size)
+            train_reader = data_processor.data_generator(
+                batch_size=batch_size, phase='train')
             size = accuracy_sum = loss_sum = 0
             for batch in train_reader():
                 loss_v, accuracy_v = exe.run(
-                    feed=data_feeder.feed(batch),
+                    feed=data_feeder.feed([batch]),
                     fetch_list=[loss.name, accuracy.name])
                 step += 1
                 size += len(batch)
@@ -106,16 +98,16 @@ def _finetune_model(task,
                 if step % config.log_interval == 0:
                     train_time_used = time.time() - train_time_begin
-                    perf = train_time_used / config.log_interval
+                    speed = config.log_interval / train_time_used
                     train_time_begin = time.time()
                     logger.info(
                         "step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
-                        (step, loss_sum / size, accuracy_sum / size, perf))
+                        (step, loss_sum / size, accuracy_sum / size, speed))
                     size = accuracy_sum = loss_sum = 0
 
                 if step % config.save_ckpt_interval == 0:
-                    model_save_dir = os.path.join(
-                        checkpoint_dir, "model_parameters_in_step%d" % step)
+                    model_save_dir = os.path.join(checkpoint_dir,
+                                                  "step_%d" % step)
                     fluid.io.save_persistables(exe, dirname=model_save_dir)
 
                 if eval_model and step % config.eval_interval == 0:
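The `speed` change in the hunk above also fixes the units of the logged value: the old `perf = train_time_used / config.log_interval` is seconds per step, while the log format string reports "[step/sec: %.2f]". The same inversion is applied to `eval_speed` in `eval()` further down. A quick worked example of the corrected computation, with made-up numbers:

```python
# Hypothetical numbers: 10 logged steps that took 2.5 seconds in total.
log_interval = 10
train_time_used = 2.5

old_perf = train_time_used / log_interval   # 0.25 -> seconds per step
new_speed = log_interval / train_time_used  # 4.0  -> steps per second,
                                            #         matching the "[step/sec]" label
print(old_perf, new_speed)
```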
@@ -123,7 +115,7 @@ def _finetune_model(task,
         # eval before end
         if eval_model:
             eval(task, data_processor, feed_list, config)
-        logger.info("Finetune end")
+        logger.info("Finetune finished")
 
 
 def save_model_and_checkpoint(task, save_dir):
@@ -150,22 +142,22 @@ def eval(task, data_processor, feed_list, config=None):
     accuracy = task.variable("accuracy")
     use_cuda = config.use_cuda
     batch_size = config.batch_size
+    logger.info("[Evaluation] start")
     with fluid.program_guard(inference_program):
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
         exe = fluid.Executor(place=place)
         size = accuracy_sum = loss_sum = 0
-        test_reader = paddle.batch(
-            data_processor.data_generator(phase='test'), batch_size=batch_size)
+        test_reader = data_processor.data_generator(
+            batch_size=batch_size, phase='test')
         eval_time_begin = time.time()
         for index, batch in enumerate(test_reader()):
             loss_v, accuracy_v, = exe.run(
-                feed=data_feeder.feed(batch), fetch_list=[loss, accuracy.name])
+                feed=data_feeder.feed([batch]),
+                fetch_list=[loss, accuracy.name])
             size += len(batch)
             accuracy_sum += accuracy_v * len(batch)
             loss_sum += loss_v * len(batch)
         eval_time_used = time.time() - eval_time_begin
-        perf = eval_time_used / index
+        eval_speed = index / eval_time_used
         logger.info("[Evaluation] loss=%.5f acc=%.5f [step/sec: %.2f]" %
-                    (loss_sum / size, accuracy_sum / size, perf))
+                    (loss_sum / size, accuracy_sum / size, eval_speed))
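Both the train and eval loops above stop wrapping the processor's generator in `paddle.batch(...)` and instead ask `data_generator(batch_size=..., phase=...)` for already-batched records, which is why `data_feeder.feed(batch)` becomes `data_feeder.feed([batch])`. A standalone sketch of that contract change, using plain Python generators instead of the real reader; the names and data are illustrative only and the "one batch per yield" contract is an assumption read off the diff:

```python
def old_style_reader():
    # Old contract: yield one example at a time; paddle.batch() did the grouping.
    for i in range(6):
        yield [i, i * 10]


def new_style_data_generator(batch_size, phase='train'):
    # New contract (as assumed from the diff): yield one whole batch per step.
    batch = []
    for sample in old_style_reader():
        batch.append(sample)
        if len(batch) == batch_size:
            yield batch
            batch = []


for batch in new_style_data_generator(batch_size=3):
    # DataFeeder.feed() expects a list of records, so a pre-built batch is
    # wrapped in a single-element list, mirroring data_feeder.feed([batch]).
    feed_value = [batch]
    print(len(batch), feed_value)
```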
@@ -19,7 +19,6 @@ import time
 import numpy as np
 import multiprocessing
 
-from paddle_hub.finetune.optimization import bert_optimization
 from .task import Task
 
 __all__ = ['append_mlp_classifier']
...
@@ -19,37 +19,27 @@ from __future__ import print_function
 import numpy as np
 import paddle.fluid as fluid
+"""
+Finetune optimization strategy
+"""
 
 
-def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
-    """ Applies linear warmup of learning rate from 0 and decay to 0."""
-    with fluid.default_main_program()._lr_schedule_guard():
-        lr = fluid.layers.tensor.create_global_var(
-            shape=[1],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            name="scheduled_learning_rate")
-
-        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
-
-        with fluid.layers.control_flow.Switch() as switch:
-            with switch.case(global_step < warmup_steps):
-                warmup_lr = learning_rate * (global_step / warmup_steps)
-                fluid.layers.tensor.assign(warmup_lr, lr)
-            with switch.default():
-                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
-                    learning_rate=learning_rate,
-                    decay_steps=num_train_steps,
-                    end_learning_rate=0.0,
-                    power=1.0,
-                    cycle=False)
-                fluid.layers.tensor.assign(decayed_lr, lr)
-
-        return lr
-
-
-def bert_optimization(loss,
+def bert_finetune(task, train_program, data_processor, config, dev_count):
+    # calculate warmup steps
+    num_train_examples = data_processor.get_num_examples(phase='train')
+    max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
+    warmup_steps = int(max_train_steps * config.warmup_proportion)
+
+    loss = task.variable("loss")
+    scheduled_lr = adam_weight_decay_optimizer_with_linear_warmup(
+        loss, warmup_steps, max_train_steps, config.learning_rate,
+        train_program, config.weight_decay)
+
+    return scheduled_lr
+
+
+def adam_weight_decay_optimizer_with_noam_decay(
+        loss,
         warmup_steps,
         num_train_steps,
         learning_rate,
@@ -104,3 +94,60 @@ def bert_optimization(loss,
             fluid.layers.assign(output=param, input=updated_param)
 
     return scheduled_lr
+
+
+def adam_weight_decay_optimizer_with_linear_warmup(loss,
+                                                   warmup_steps,
+                                                   num_train_steps,
+                                                   learning_rate,
+                                                   train_program,
+                                                   weight_decay,
+                                                   scheduler='noam_decay'):
+    if warmup_steps > 0:
+        if scheduler == 'noam_decay':
+            scheduled_lr = fluid.layers.learning_rate_scheduler\
+                .noam_decay(1 / (warmup_steps * (learning_rate ** 2)),
+                            warmup_steps)
+        elif scheduler == 'linear_warmup_decay':
+            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
+                                               num_train_steps)
+        else:
+            raise ValueError("Unknown learning rate scheduler, should be "
+                             "'noam_decay' or 'linear_warmup_decay'")
+        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
+    else:
+        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
+        scheduled_lr = learning_rate
+
+    clip_norm_thres = 1.0
+    fluid.clip.set_gradient_clip(
+        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
+
+    def exclude_from_weight_decay(name):
+        if name.find("layer_norm") > -1:
+            return True
+        bias_suffix = ["_bias", "_b", ".b_0"]
+        for suffix in bias_suffix:
+            if name.endswith(suffix):
+                return True
+        return False
+
+    param_list = dict()
+    for param in train_program.global_block().all_parameters():
+        param_list[param.name] = param * 1.0
+        param_list[param.name].stop_gradient = True
+
+    _, param_grads = optimizer.minimize(loss)
+
+    if weight_decay > 0:
+        for param, grad in param_grads:
+            if exclude_from_weight_decay(param.name):
+                continue
+            with param.block.program._optimized_guard(
+                    [param, grad]), fluid.framework.name_scope("weight_decay"):
+                updated_param = param - param_list[
+                    param.name] * weight_decay * scheduled_lr
+                fluid.layers.assign(output=param, input=updated_param)
+
+    return scheduled_lr
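`bert_finetune` above derives the schedule lengths from the dataset size, and `adam_weight_decay_optimizer_with_linear_warmup` then builds the warmup schedule and applies weight decay manually, skipping `layer_norm` weights and `_bias`/`_b`/`.b_0` parameters. A small self-contained sketch of that arithmetic and of the shape of the `linear_warmup_decay` branch (modeled on the left-column implementation removed in the earlier hunk, which this branch still references by name); all dataset and hyperparameter numbers below are hypothetical:

```python
# Hypothetical numbers, not taken from the repository or any dataset.
num_train_examples = 9600
num_epoch, batch_size, dev_count = 3, 32, 1
warmup_proportion, learning_rate = 0.1, 5e-5

# Same arithmetic as bert_finetune above.
max_train_steps = num_epoch * num_train_examples // batch_size // dev_count
warmup_steps = int(max_train_steps * warmup_proportion)
print(max_train_steps, warmup_steps)   # 900 90


def scheduled_lr(step):
    """Shape of the removed linear_warmup_decay schedule: linear ramp, then decay."""
    if step < warmup_steps:
        return learning_rate * step / warmup_steps
    # polynomial_decay with power=1.0 and end_learning_rate=0.0 is a straight
    # line down to zero at max_train_steps (counted from global step 0).
    return learning_rate * max(0.0, 1.0 - float(step) / max_train_steps)


for step in (0, 45, 90, 450, 900):
    print(step, scheduled_lr(step))
```

After `optimizer.minimize`, the decay pass shifts each non-excluded parameter by `pre_update_param * weight_decay * scheduled_lr`, using the stop-gradient copies collected in `param_list` before the update.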
@@ -19,9 +19,6 @@ import time
 import numpy as np
 import multiprocessing
 
-from paddle_hub.finetune.optimization import bert_optimization
-from paddle_hub.finetune.config import FinetuneConfig
-
 
 class Task(object):
     def __init__(self, task_type, graph_var_dict, main_program,
...