Commit 3831c8ba authored by Zeyu Chen

polish finetune.py and finetune_with_hub for bert classification

Parent commit: d7cfc311
@@ -73,9 +73,9 @@ if __name__ == '__main__':
         warmup_proportion=args.warmup_proportion)

     # loading paddlehub BERT
-    module = hub.Module(
-        module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
-    # module = hub.Module(module_dir="./hub_module/ernie-stable.hub_module")
+    # module = hub.Module(
+    #     module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
+    module = hub.Module(module_dir="./hub_module/ernie-stable.hub_module")

     processor = reader.BERTClassifyReader(
         data_dir=args.data_dir,
@@ -51,6 +51,10 @@ class DataProcessor(object):
         """Gets a collection of `InputExample`s for the dev set."""
         raise NotImplementedError()

+    def get_val_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the val set."""
+        raise NotImplementedError()
+
     def get_test_examples(self, data_dir):
         """Gets a collection of `InputExample`s for prediction."""
         raise NotImplementedError()

@@ -109,9 +113,9 @@ class DataProcessor(object):
     def get_num_examples(self, phase):
         """Get number of examples for train, dev or test."""
-        if phase not in ['train', 'validate', 'test']:
+        if phase not in ['train', 'val', 'dev', 'test']:
             raise ValueError(
-                "Unknown phase, which should be in ['train', 'validate, 'test']."
+                "Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
             )
         return self.num_examples[phase]

@@ -132,9 +136,9 @@ class DataProcessor(object):
         if phase == 'train':
             examples = self.get_train_examples(self.data_dir)
             self.num_examples['train'] = len(examples)
-        elif phase == 'validate':
+        elif phase == 'val' or phase == 'dev':
             examples = self.get_dev_examples(self.data_dir)
-            self.num_examples['validate'] = len(examples)
+            self.num_examples['dev'] = len(examples)
         elif phase == 'test':
             examples = self.get_test_examples(self.data_dir)
             self.num_examples['test'] = len(examples)
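With these changes a reader accepts 'val' and 'dev' interchangeably as the validation phase. The same aliasing logic as a standalone sketch (a hypothetical helper written for illustration, not part of the library):

    # Hypothetical stand-alone version of the phase handling above.
    def resolve_phase(phase):
        if phase not in ['train', 'val', 'dev', 'test']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'val'/'dev', 'test'].")
        # 'val' and 'dev' name the same split and are counted under 'dev'
        return 'dev' if phase in ('val', 'dev') else phase

    assert resolve_phase('val') == resolve_phase('dev') == 'dev'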
@@ -2,7 +2,7 @@ export CUDA_VISIBLE_DEVICES=5
 DATA_PATH=./chnsenticorp_data

-rm -rf $CKPT_PATH
+rm -rf ./bert_cls_ckpt

 python -u finetune_with_hub.py \
                    --batch_size 32 \
                    --in_tokens false \
@@ -30,34 +30,35 @@ from paddle_hub.finetune.checkpoint import load_checkpoint, save_checkpoint

 CKPT_FILE = "ckpt.meta"


-def _finetune_model(task,
-                    data_processor,
-                    feed_list,
-                    config=None,
-                    eval_model=False):
+def _get_running_device_info(config):
+    if config.use_cuda:
+        place = fluid.CUDAPlace(0)
+        dev_count = fluid.core.get_cuda_device_count()
+    else:
+        place = fluid.CPUPlace()
+        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+
+    return place, dev_count
+
+
+def _finetune_model(task, data_processor, feed_list, config=None,
+                    do_eval=False):
     main_program = task.main_program()
     startup_program = task.startup_program()
     loss = task.variable("loss")
     accuracy = task.variable("accuracy")

-    epoch = config.num_epoch
+    num_epoch = config.num_epoch
     batch_size = config.batch_size
     learning_rate = config.learning_rate
-    use_cuda = config.use_cuda
     with_memory_optimization = config.with_memory_optimization
     checkpoint_dir = config.checkpoint_dir
     checkpoint_path = os.path.join(checkpoint_dir, CKPT_FILE)
     log_writter = LogWriter(
         os.path.join(checkpoint_dir, "vdllog"), sync_cycle=10)

+    place, dev_count = _get_running_device_info(config)
     with fluid.program_guard(main_program, startup_program):
-        if use_cuda:
-            place = fluid.CUDAPlace(0)
-            dev_count = fluid.core.get_cuda_device_count()
-        else:
-            place = fluid.CPUPlace()
-            dev_count = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
         exe = fluid.Executor(place=place)
         data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
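The new _get_running_device_info helper centralizes device selection so training and evaluation share it. On CPU it takes the parallel device count from the CPU_NUM environment variable, falling back to the machine's core count; the fallback in isolation:

    import multiprocessing
    import os

    # Same lookup the helper performs for CPU execution: honor CPU_NUM when
    # set, otherwise use every available core.
    dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    print(dev_count)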
@@ -90,14 +91,17 @@ def _finetune_model(task,
                   (lower_mem, upper_mem, unit)),

     # initilize
     if os.path.exists(checkpoint_path):
-        last_epoch, step, last_model_dir = load_checkpoint(checkpoint_path)
+        last_epoch, global_step, last_model_dir = load_checkpoint(
+            checkpoint_path)
         fluid.io.load_persistables(exe, last_model_dir)
     else:
         exe.run(fluid.default_startup_program())
-        step = 0
-        last_epoch = 0
-        best_eval_acc = 0
+        global_step = 0
+        last_epoch = 1

-    logger.info("Finetune start")
+    best_eval_acc = 0.0
+    train_time_used = 0
+    logger.info("PaddleHub finetune start")

     # add visualdl scalar
     with log_writter.mode("train") as logw:
@@ -107,84 +111,90 @@ def _finetune_model(task,
         eval_loss_scalar = logw.scalar(tag="loss[evaluate]")
         eval_acc_scalar = logw.scalar(tag="accuracy[evaluate]")

-    train_time_begin = time.time()
-    for index in range(last_epoch, epoch):
+    for epoch in range(last_epoch, num_epoch + 1):
         train_reader = data_processor.data_generator(
             batch_size=batch_size, phase='train')
-        size = accuracy_sum = loss_sum = 0
+        num_trained_examples = acc_sum = loss_sum = 0
         for batch in train_reader():
+            num_batch_examples = len(batch)
+            train_time_begin = time.time()
             loss_v, accuracy_v = exe.run(
                 feed=data_feeder.feed(batch),
                 fetch_list=[loss.name, accuracy.name])
-            step += 1
-            size += len(batch)
-            accuracy_sum += accuracy_v * len(batch)
-            loss_sum += loss_v * len(batch)
+            train_time_used += time.time() - train_time_begin
+            global_step += 1
+            num_trained_examples += num_batch_examples
+            acc_sum += accuracy_v * num_batch_examples
+            loss_sum += loss_v * num_batch_examples

-            # print log
-            if step % config.log_interval == 0:
-                train_time_used = time.time() - train_time_begin
+            # log fintune status
+            if global_step % config.log_interval == 0:
+                avg_loss = loss_sum / num_trained_examples
+                avg_acc = acc_sum / num_trained_examples
                 speed = config.log_interval / train_time_used
-                train_time_begin = time.time()
-                logger.info(
-                    "step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
-                    (step, loss_sum / size, accuracy_sum / size, speed))
+                logger.info("step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
+                            (global_step, avg_loss, avg_acc, speed))

                 # record visualdl log
-                record_step = step
-                train_loss_scalar.add_record(record_step, loss_sum / size)
-                train_acc_scalar.add_record(record_step,
-                                            accuracy_sum / size)
+                train_loss_scalar.add_record(global_step, avg_loss)
+                train_acc_scalar.add_record(global_step, avg_acc)

-                size = accuracy_sum = loss_sum = 0
+                train_time_used = 0
+                num_trained_examples = acc_sum = loss_sum = 0

-            if step % config.save_ckpt_interval == 0:
-                model_save_dir = os.path.join(checkpoint_dir,
-                                              "model_in_step_%d" % step)
-                fluid.io.save_persistables(exe, dirname=model_save_dir)
+            if global_step % config.save_ckpt_interval == 0:
+                model_saved_dir = os.path.join(
+                    checkpoint_dir, "model_in_step_%d" % global_step)
+                fluid.io.save_persistables(exe, dirname=model_saved_dir)
+                # NOTE: current saved checkpoint machanism is not completed,
+                # it can't restore dataset training status
                 save_checkpoint(
                     checkpoint_path,
-                    last_epoch=index,
-                    last_step=step,
-                    last_model_dir=model_save_dir)
+                    last_epoch=epoch,
+                    last_step=global_step,
+                    last_model_dir=model_saved_dir)

-            if eval_model and step % config.eval_interval == 0:
+            if do_eval and global_step % config.eval_interval == 0:
                 eval_loss, eval_acc, eval_perf = evaluate(
                     task,
                     data_processor,
                     feed_list,
-                    phase="validate",
+                    phase="val",
                     config=config)
-                record_step = step
-                eval_loss_scalar.add_record(record_step, eval_loss)
-                eval_acc_scalar.add_record(record_step, eval_acc)
+                eval_loss_scalar.add_record(global_step, eval_loss)
+                eval_acc_scalar.add_record(global_step, eval_acc)
                 if eval_acc > best_eval_acc:
                     best_eval_acc = eval_acc
-                    model_save_dir = os.path.join(checkpoint_dir,
-                                                  "model_best")
-                    fluid.io.save_persistables(exe, dirname=model_save_dir)
+                    model_saved_dir = os.path.join(checkpoint_dir,
+                                                   "best_model")
+                    logger.info(
+                        "best model saved to %s [best accuracy=%.5f]" %
+                        (model_saved_dir, best_eval_acc))
+                    fluid.io.save_persistables(exe, dirname=model_saved_dir)

     # update model and checkpoint
-    model_save_dir = os.path.join(checkpoint_dir, "model_latest")
-    fluid.io.save_persistables(exe, dirname=model_save_dir)
+    model_saved_dir = os.path.join(checkpoint_dir, "final_model")
+    fluid.io.save_persistables(exe, dirname=model_saved_dir)
+    # NOTE: current saved checkpoint machanism is not completed, it can't
+    # resotre dataset training status
     save_checkpoint(
         checkpoint_path,
-        last_epoch=epoch + 1,
-        last_step=step,
-        last_model_dir=model_save_dir)
+        last_epoch=num_epoch + 1,
+        last_step=global_step,
+        last_model_dir=model_saved_dir)

     # eval before end
-    if eval_model:
+    if do_eval:
         evaluate(
             task, data_processor, feed_list, phase="test", config=config)
-    logger.info("Finetune finished")
+    logger.info("PaddleHub finetune finished.")


 def finetune_and_eval(task, data_processor, feed_list, config=None):
-    _finetune_model(task, data_processor, feed_list, config, eval_model=True)
+    _finetune_model(task, data_processor, feed_list, config, do_eval=True)


 def finetune(task, data_processor, feed_list, config=None):
-    _finetune_model(task, data_processor, feed_list, config, eval_model=False)
+    _finetune_model(task, data_processor, feed_list, config, do_eval=False)


 def evaluate(task, data_processor, feed_list, phase="test", config=None):
@@ -192,25 +202,31 @@ def evaluate(task, data_processor, feed_list, phase="test", config=None):
     main_program = task.main_program()
     loss = task.variable("loss")
     accuracy = task.variable("accuracy")
-    use_cuda = config.use_cuda
     batch_size = config.batch_size
+
+    place, dev_count = _get_running_device_info(config)
+    exe = fluid.Executor(place=place)
     with fluid.program_guard(inference_program):
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
-        exe = fluid.Executor(place=place)
-        size = accuracy_sum = loss_sum = 0
+        num_eval_examples = acc_sum = loss_sum = 0
         test_reader = data_processor.data_generator(
             batch_size=batch_size, phase=phase)
         eval_time_begin = time.time()
-        for index, batch in enumerate(test_reader()):
-            loss_v, accuracy_v, = exe.run(
-                feed=data_feeder.feed(batch), fetch_list=[loss, accuracy.name])
-            size += len(batch)
-            accuracy_sum += accuracy_v * len(batch)
-            loss_sum += loss_v * len(batch)
+        eval_step = 0
+        for batch in test_reader():
+            num_batch_examples = len(batch)
+            eval_step += 1
+            loss_v, accuracy_v = exe.run(
+                feed=data_feeder.feed(batch),
+                fetch_list=[loss.name, accuracy.name])
+            num_eval_examples += num_batch_examples
+            acc_sum += accuracy_v * num_batch_examples
+            loss_sum += loss_v * num_batch_examples
         eval_time_used = time.time() - eval_time_begin
-        eval_speed = index / eval_time_used
-    logger.info("[Evaluation] loss=%.5f acc=%.5f [step/sec: %.2f]" %
-                (loss_sum / size, accuracy_sum / size, eval_speed))
-    return loss_sum / size, accuracy_sum / size, eval_speed
+
+    avg_loss = loss_sum / num_eval_examples
+    avg_acc = acc_sum / num_eval_examples
+    eval_speed = eval_step / eval_time_used
+    logger.info("[evaluation on %s set] loss=%.5f acc=%.5f [step/sec: %.2f]" %
+                (phase, avg_loss, avg_acc, eval_speed))
+    return avg_loss, avg_acc, eval_speed
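The reworked training loop accumulates loss, accuracy, and pure exe.run time over the window since the last report, logs the averages, and then resets its counters. The same bookkeeping in isolation, with stand-in batch values and a sleep in place of exe.run (all values below are hypothetical):

    import time

    log_interval = 10
    global_step = 0
    train_time_used = 0.0
    num_trained_examples = acc_sum = loss_sum = 0

    for step_loss, step_acc, batch_len in [(0.7, 0.5, 32)] * 25:  # fake batches
        train_time_begin = time.time()
        time.sleep(0.01)  # stand-in for exe.run(...)
        train_time_used += time.time() - train_time_begin
        global_step += 1
        num_trained_examples += batch_len
        acc_sum += step_acc * batch_len
        loss_sum += step_loss * batch_len

        if global_step % log_interval == 0:
            avg_loss = loss_sum / num_trained_examples
            avg_acc = acc_sum / num_trained_examples
            speed = log_interval / train_time_used
            print("step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
                  (global_step, avg_loss, avg_acc, speed))
            # reset the window, as the training loop above does
            train_time_used = 0.0
            num_trained_examples = acc_sum = loss_sum = 0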
@@ -444,8 +444,13 @@ class Module:
             if key:
                 fetch_dict[key] = program.global_block().var(var.name)

+        # record num parameters loaded by paddlehub
+        num_param_loaded = 0
         for param in program.global_block().iter_parameters():
-            logger.debug("%s %s" % (param.name, param.optimize_attr))
+            num_param_loaded += 1
+            # logger.debug("%s %s" % (param.name, param.optimize_attr))
+        logger.info(
+            "%d pretrained paramaters loaded by PaddleHub" % num_param_loaded)

         return feed_dict, fetch_dict, program
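The load path now reports how many pretrained parameters were attached to the program. A standalone sketch of the same counting pattern, assuming a Paddle 1.x fluid install; the toy network exists only to give the program some parameters:

    import paddle.fluid as fluid

    prog = fluid.Program()
    with fluid.program_guard(prog):
        x = fluid.layers.data(name="x", shape=[4], dtype="float32")
        fluid.layers.fc(input=x, size=2)  # adds a weight and a bias parameter

    num_param_loaded = 0
    for param in prog.global_block().iter_parameters():
        num_param_loaded += 1
    print("%d parameters in program" % num_param_loaded)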
@@ -490,9 +495,10 @@ class Module:
         # create module pb
         module_desc = module_desc_pb2.ModuleDesc()
-        logger.info("hub version is %s" % version.hub_version)
-        logger.info("module proto version is %s" % version.module_proto_version)
-        logger.info("paddle version is %s" % paddle.__version__)
+        logger.info("PaddleHub version = %s" % version.hub_version)
+        logger.info("PaddleHub Module proto version = %s" %
+                    version.module_proto_version)
+        logger.info("Paddle version = %s" % paddle.__version__)

         feeded_var_names = [
             input.name for key, sign in self.signatures.items()