提交 3831c8ba 编写于 作者: Z Zeyu Chen

polish finetune.py and finetune_with_hub for bert classification

上级 d7cfc311
......@@ -73,9 +73,9 @@ if __name__ == '__main__':
warmup_proportion=args.warmup_proportion)
# loading paddlehub BERT
module = hub.Module(
module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
# module = hub.Module(module_dir="./hub_module/ernie-stable.hub_module")
# module = hub.Module(
# module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
module = hub.Module(module_dir="./hub_module/ernie-stable.hub_module")
processor = reader.BERTClassifyReader(
data_dir=args.data_dir,
......
......@@ -51,6 +51,10 @@ class DataProcessor(object):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_val_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the val set."""
raise NotImplementedError()
def get_test_examples(self, data_dir):
"""Gets a collection of `InputExample`s for prediction."""
raise NotImplementedError()
......@@ -109,9 +113,9 @@ class DataProcessor(object):
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'validate', 'test']:
if phase not in ['train', 'val', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'validate, 'test']."
"Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
)
return self.num_examples[phase]
......@@ -132,9 +136,9 @@ class DataProcessor(object):
if phase == 'train':
examples = self.get_train_examples(self.data_dir)
self.num_examples['train'] = len(examples)
elif phase == 'validate':
elif phase == 'val' or phase == 'dev':
examples = self.get_dev_examples(self.data_dir)
self.num_examples['validate'] = len(examples)
self.num_examples['dev'] = len(examples)
elif phase == 'test':
examples = self.get_test_examples(self.data_dir)
self.num_examples['test'] = len(examples)
......
......@@ -2,7 +2,7 @@ export CUDA_VISIBLE_DEVICES=5
DATA_PATH=./chnsenticorp_data
rm -rf $CKPT_PATH
rm -rf ./bert_cls_ckpt
python -u finetune_with_hub.py \
--batch_size 32 \
--in_tokens false \
......
......@@ -30,34 +30,35 @@ from paddle_hub.finetune.checkpoint import load_checkpoint, save_checkpoint
CKPT_FILE = "ckpt.meta"
def _finetune_model(task,
data_processor,
feed_list,
config=None,
eval_model=False):
def _get_running_device_info(config):
if config.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
return place, dev_count
def _finetune_model(task, data_processor, feed_list, config=None,
do_eval=False):
main_program = task.main_program()
startup_program = task.startup_program()
loss = task.variable("loss")
accuracy = task.variable("accuracy")
epoch = config.num_epoch
num_epoch = config.num_epoch
batch_size = config.batch_size
learning_rate = config.learning_rate
use_cuda = config.use_cuda
with_memory_optimization = config.with_memory_optimization
checkpoint_dir = config.checkpoint_dir
checkpoint_path = os.path.join(checkpoint_dir, CKPT_FILE)
log_writter = LogWriter(
os.path.join(checkpoint_dir, "vdllog"), sync_cycle=10)
place, dev_count = _get_running_device_info(config)
with fluid.program_guard(main_program, startup_program):
if use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place=place)
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
......@@ -90,14 +91,17 @@ def _finetune_model(task,
(lower_mem, upper_mem, unit)),
# initilize
if os.path.exists(checkpoint_path):
last_epoch, step, last_model_dir = load_checkpoint(checkpoint_path)
last_epoch, global_step, last_model_dir = load_checkpoint(
checkpoint_path)
fluid.io.load_persistables(exe, last_model_dir)
else:
exe.run(fluid.default_startup_program())
step = 0
last_epoch = 0
best_eval_acc = 0
logger.info("Finetune start")
global_step = 0
last_epoch = 1
best_eval_acc = 0.0
train_time_used = 0
logger.info("PaddleHub finetune start")
# add visualdl scalar
with log_writter.mode("train") as logw:
......@@ -107,84 +111,90 @@ def _finetune_model(task,
eval_loss_scalar = logw.scalar(tag="loss[evaluate]")
eval_acc_scalar = logw.scalar(tag="accuracy[evaluate]")
train_time_begin = time.time()
for index in range(last_epoch, epoch):
for epoch in range(last_epoch, num_epoch + 1):
train_reader = data_processor.data_generator(
batch_size=batch_size, phase='train')
size = accuracy_sum = loss_sum = 0
num_trained_examples = acc_sum = loss_sum = 0
for batch in train_reader():
num_batch_examples = len(batch)
train_time_begin = time.time()
loss_v, accuracy_v = exe.run(
feed=data_feeder.feed(batch),
fetch_list=[loss.name, accuracy.name])
step += 1
size += len(batch)
accuracy_sum += accuracy_v * len(batch)
loss_sum += loss_v * len(batch)
# print log
if step % config.log_interval == 0:
train_time_used = time.time() - train_time_begin
train_time_used += time.time() - train_time_begin
global_step += 1
num_trained_examples += num_batch_examples
acc_sum += accuracy_v * num_batch_examples
loss_sum += loss_v * num_batch_examples
# log fintune status
if global_step % config.log_interval == 0:
avg_loss = loss_sum / num_trained_examples
avg_acc = acc_sum / num_trained_examples
speed = config.log_interval / train_time_used
train_time_begin = time.time()
logger.info(
"step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
(step, loss_sum / size, accuracy_sum / size, speed))
logger.info("step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
(global_step, avg_loss, avg_acc, speed))
# record visualdl log
record_step = step
train_loss_scalar.add_record(record_step, loss_sum / size)
train_acc_scalar.add_record(record_step,
accuracy_sum / size)
size = accuracy_sum = loss_sum = 0
if step % config.save_ckpt_interval == 0:
model_save_dir = os.path.join(checkpoint_dir,
"model_in_step_%d" % step)
fluid.io.save_persistables(exe, dirname=model_save_dir)
train_loss_scalar.add_record(global_step, avg_loss)
train_acc_scalar.add_record(global_step, avg_acc)
train_time_used = 0
num_trained_examples = acc_sum = loss_sum = 0
if global_step % config.save_ckpt_interval == 0:
model_saved_dir = os.path.join(
checkpoint_dir, "model_in_step_%d" % global_step)
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# NOTE: current saved checkpoint machanism is not completed,
# it can't restore dataset training status
save_checkpoint(
checkpoint_path,
last_epoch=index,
last_step=step,
last_model_dir=model_save_dir)
last_epoch=epoch,
last_step=global_step,
last_model_dir=model_saved_dir)
if eval_model and step % config.eval_interval == 0:
if do_eval and global_step % config.eval_interval == 0:
eval_loss, eval_acc, eval_perf = evaluate(
task,
data_processor,
feed_list,
phase="validate",
phase="val",
config=config)
record_step = step
eval_loss_scalar.add_record(record_step, eval_loss)
eval_acc_scalar.add_record(record_step, eval_acc)
eval_loss_scalar.add_record(global_step, eval_loss)
eval_acc_scalar.add_record(global_step, eval_acc)
if eval_acc > best_eval_acc:
best_eval_acc = eval_acc
model_save_dir = os.path.join(checkpoint_dir,
"model_best")
fluid.io.save_persistables(exe, dirname=model_save_dir)
model_saved_dir = os.path.join(checkpoint_dir,
"best_model")
logger.info(
"best model saved to %s [best accuracy=%.5f]" %
(model_saved_dir, best_eval_acc))
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# update model and checkpoint
model_save_dir = os.path.join(checkpoint_dir, "model_latest")
fluid.io.save_persistables(exe, dirname=model_save_dir)
model_saved_dir = os.path.join(checkpoint_dir, "final_model")
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# NOTE: current saved checkpoint machanism is not completed, it can't
# resotre dataset training status
save_checkpoint(
checkpoint_path,
last_epoch=epoch + 1,
last_step=step,
last_model_dir=model_save_dir)
# eval before end
if eval_model:
last_epoch=num_epoch + 1,
last_step=global_step,
last_model_dir=model_saved_dir)
if do_eval:
evaluate(
task, data_processor, feed_list, phase="test", config=config)
logger.info("Finetune finished")
logger.info("PaddleHub finetune finished.")
def finetune_and_eval(task, data_processor, feed_list, config=None):
_finetune_model(task, data_processor, feed_list, config, eval_model=True)
_finetune_model(task, data_processor, feed_list, config, do_eval=True)
def finetune(task, data_processor, feed_list, config=None):
_finetune_model(task, data_processor, feed_list, config, eval_model=False)
_finetune_model(task, data_processor, feed_list, config, do_eval=False)
def evaluate(task, data_processor, feed_list, phase="test", config=None):
......@@ -192,25 +202,31 @@ def evaluate(task, data_processor, feed_list, phase="test", config=None):
main_program = task.main_program()
loss = task.variable("loss")
accuracy = task.variable("accuracy")
use_cuda = config.use_cuda
batch_size = config.batch_size
place, dev_count = _get_running_device_info(config)
exe = fluid.Executor(place=place)
with fluid.program_guard(inference_program):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe = fluid.Executor(place=place)
size = accuracy_sum = loss_sum = 0
num_eval_examples = acc_sum = loss_sum = 0
test_reader = data_processor.data_generator(
batch_size=batch_size, phase=phase)
eval_time_begin = time.time()
for index, batch in enumerate(test_reader()):
loss_v, accuracy_v, = exe.run(
feed=data_feeder.feed(batch), fetch_list=[loss, accuracy.name])
size += len(batch)
accuracy_sum += accuracy_v * len(batch)
loss_sum += loss_v * len(batch)
eval_step = 0
for batch in test_reader():
num_batch_examples = len(batch)
eval_step += 1
loss_v, accuracy_v = exe.run(
feed=data_feeder.feed(batch),
fetch_list=[loss.name, accuracy.name])
num_eval_examples += num_batch_examples
acc_sum += accuracy_v * num_batch_examples
loss_sum += loss_v * num_batch_examples
eval_time_used = time.time() - eval_time_begin
eval_speed = index / eval_time_used
logger.info("[Evaluation] loss=%.5f acc=%.5f [step/sec: %.2f]" %
(loss_sum / size, accuracy_sum / size, eval_speed))
return loss_sum / size, accuracy_sum / size, eval_speed
avg_loss = loss_sum / num_eval_examples
avg_acc = acc_sum / num_eval_examples
eval_speed = eval_step / eval_time_used
logger.info("[evaluation on %s set] loss=%.5f acc=%.5f [step/sec: %.2f]" %
(phase, avg_loss, avg_acc, eval_speed))
return avg_loss, avg_acc, eval_speed
......@@ -444,8 +444,13 @@ class Module:
if key:
fetch_dict[key] = program.global_block().var(var.name)
# record num parameters loaded by paddlehub
num_param_loaded = 0
for param in program.global_block().iter_parameters():
logger.debug("%s %s" % (param.name, param.optimize_attr))
num_param_loaded += 1
# logger.debug("%s %s" % (param.name, param.optimize_attr))
logger.info(
"%d pretrained paramaters loaded by PaddleHub" % num_param_loaded)
return feed_dict, fetch_dict, program
......@@ -490,9 +495,10 @@ class Module:
# create module pb
module_desc = module_desc_pb2.ModuleDesc()
logger.info("hub version is %s" % version.hub_version)
logger.info("module proto version is %s" % version.module_proto_version)
logger.info("paddle version is %s" % paddle.__version__)
logger.info("PaddleHub version = %s" % version.hub_version)
logger.info("PaddleHub Module proto version = %s" %
version.module_proto_version)
logger.info("Paddle version = %s" % paddle.__version__)
feeded_var_names = [
input.name for key, sign in self.signatures.items()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册