Commit 9c96edfe authored by Zeyu Chen

refactor save_checkpoint, load_checkpoint, and memory optimization; add more logs

Parent 4920392b
......@@ -58,7 +58,7 @@ if __name__ == '__main__':
config = FinetuneConfig(
log_interval=10,
eval_interval=100,
save_ckpt_interval=200,
save_ckpt_interval=50,
use_cuda=True,
checkpoint_dir=args.checkpoint_dir,
learning_rate=args.learning_rate,
......@@ -67,8 +67,7 @@ if __name__ == '__main__':
max_seq_len=args.max_seq_len,
weight_decay=args.weight_decay,
finetune_strategy="bert_finetune",
with_memory_optimization=True,
in_tokens=False,
enable_memory_optim=True,
optimizer=None,
warmup_proportion=args.warmup_proportion)
......
......@@ -3,17 +3,17 @@ export CUDA_VISIBLE_DEVICES=5
DATA_PATH=./chnsenticorp_data
#HUB_MODULE_DIR="./hub_module/bert_chinese_L-12_H-768_A-12.hub_module"
HUB_MODULE_DIR="./hub_module/ernie_stable.hub_module"
HUB_MODULE_DIR="./hub_module/bert_chinese_L-12_H-768_A-12.hub_module"
#HUB_MODULE_DIR="./hub_module/ernie_stable.hub_module"
CKPT_DIR="./ckpt"
rm -rf $CKPT_DIR
#rm -rf $CKPT_DIR
python -u finetune_with_hub.py \
--batch_size 64 \
--batch_size 128 \
--hub_module_dir=$HUB_MODULE_DIR \
--data_dir ${DATA_PATH} \
--weight_decay 0.01 \
--checkpoint_dir $CKPT_DIR \
--warmup_proportion 0.0 \
--epoch 3 \
--max_seq_len 128 \
--epoch 2 \
--max_seq_len 16 \
--learning_rate 5e-5
......@@ -171,7 +171,7 @@ def connect_program(pre_program, next_program, input_dict=None, inplace=True):
outputs={'Out': output_var})
block_map = {0: 0}
logger.info("start to connect program")
logger.info("Connect program's input tensor")
for index, block in enumerate(next_program.blocks):
if block.idx == 0:
_copy_vars_and_ops_in_blocks(block, output_program.global_block())
......@@ -183,14 +183,13 @@ def connect_program(pre_program, next_program, input_dict=None, inplace=True):
new_block = output_program._create_block(
parent_idx=block_map[block.parent_idx])
_copy_vars_and_ops_in_blocks(block, new_block)
logger.info("end of connect program")
logger.info("Connect program's input tensor done")
return output_program
def remove_feed_fetch_op(program):
""" remove feed and fetch operator and variable for fine-tuning
"""
logger.info("remove feed fetch op")
block = program.global_block()
need_to_remove_op_index = []
for i, op in enumerate(block.ops):
......
// Copyright 2018 The Paddle Authors. All Rights Reserved.
// Copyright 2019 The Paddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -19,7 +19,7 @@ option optimize_for = LITE_RUNTIME;
package paddle_hub_finetune_checkpoint;
message CheckPoint {
int64 last_epoch = 1;
int64 last_step = 2;
string last_model_dir = 3;
int64 current_epoch = 1;
int64 global_step = 2;
string latest_model_dir = 3;
}
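For reference, a minimal round-trip sketch of the renamed fields through the generated Python bindings (assuming `checkpoint_pb2` has been regenerated from this `.proto`; the epoch/step values are made up):

```python
from paddle_hub.finetune import checkpoint_pb2

# Build a checkpoint record using the new field names.
ckpt = checkpoint_pb2.CheckPoint()
ckpt.current_epoch = 3
ckpt.global_step = 200
ckpt.latest_model_dir = "./ckpt/step_200"

# SerializeToString yields the bytes that save_checkpoint writes to ckpt.meta;
# ParseFromString is how load_checkpoint reads them back.
data = ckpt.SerializeToString()
restored = checkpoint_pb2.CheckPoint()
restored.ParseFromString(data)
assert restored.latest_model_dir == "./ckpt/step_200"
```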
......@@ -16,20 +16,50 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import paddle.fluid as fluid
from paddle_hub.finetune import checkpoint_pb2
from paddle_hub.common.logger import logger
CKPT_FILE_NAME = "ckpt.meta"
def load_checkpoint(checkpoint_path):
ckpt = checkpoint_pb2.CheckPoint()
with open(checkpoint_path, "rb") as file:
ckpt.ParseFromString(file.read())
return ckpt.last_epoch, ckpt.last_step, ckpt.last_model_dir
def load_checkpoint(checkpoint_dir, exe):
ckpt_meta_path = os.path.join(checkpoint_dir, CKPT_FILE_NAME)
logger.info("Try loading checkpoint from {}".format(ckpt_meta_path))
if os.path.exists(ckpt_meta_path):
ckpt = checkpoint_pb2.CheckPoint()
with open(ckpt_meta_path, "rb") as f:
ckpt.ParseFromString(f.read())
fluid.io.load_persistables(exe, ckpt.latest_model_dir)
logger.info("Checkpoint loaded. current_epoch={},"
"global_step={}".format(ckpt_meta_path, current_epoch,
global_step))
return ckpt.current_epoch, ckpt.global_step
else:
current_epoch = 1
global_step = 0
latest_model_dir = None
logger.info("Checkpoint not found, start training from scratch...")
exe.run(fluid.default_startup_program())
def save_checkpoint(checkpoint_path, last_epoch, last_step, last_model_dir):
return current_epoch, global_step
def save_checkpoint(checkpoint_dir, current_epoch, global_step, exe):
ckpt_meta_path = os.path.join(checkpoint_dir, CKPT_FILE_NAME)
ckpt = checkpoint_pb2.CheckPoint()
ckpt.last_epoch = last_epoch
ckpt.last_step = last_step
ckpt.last_model_dir = last_model_dir
with open(checkpoint_path, "wb") as file:
file.write(ckpt.SerializeToString())
model_saved_dir = os.path.join(checkpoint_dir, "step_%d" % global_step)
logger.info("Saving model checkpoint to {}".format(model_saved_dir))
fluid.io.save_persistables(exe, dirname=model_saved_dir)
ckpt.current_epoch = current_epoch
ckpt.global_step = global_step
ckpt.latest_model_dir = model_saved_dir
with open(ckpt_meta_path, "wb") as f:
f.write(ckpt.SerializeToString())
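Taken together, the refactored pair is meant to be driven like this (a hypothetical sketch, not part of the commit; the directory name, epoch count, and step increment are stand-ins):

```python
import paddle.fluid as fluid
from paddle_hub.finetune.checkpoint import load_checkpoint, save_checkpoint

checkpoint_dir = "./ckpt"  # hypothetical directory
exe = fluid.Executor(fluid.CPUPlace())

# Restores persistables and returns (current_epoch, global_step); when no
# ckpt.meta exists it runs the startup program and returns (1, 0) instead.
current_epoch, global_step = load_checkpoint(checkpoint_dir, exe)

num_epoch = 2  # stand-in for config.num_epoch
for epoch in range(current_epoch, num_epoch + 1):
    global_step += 100  # stand-in for the steps actually trained this epoch
    # Saves persistables under ./ckpt/step_<global_step>, then rewrites ckpt.meta.
    save_checkpoint(
        checkpoint_dir=checkpoint_dir,
        current_epoch=epoch,
        global_step=global_step,
        exe=exe)
```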
......@@ -7,7 +7,6 @@ from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
......@@ -16,10 +15,10 @@ DESCRIPTOR = _descriptor.FileDescriptor(
name='checkpoint.proto',
package='paddle_hub_finetune_checkpoint',
syntax='proto3',
serialized_options=_b('H\003'),
serialized_pb=_b(
'\n\x10\x63heckpoint.proto\x12\x1epaddle_hub_finetune_checkpoint\"K\n\nCheckPoint\x12\x12\n\nlast_epoch\x18\x01 \x01(\x03\x12\x11\n\tlast_step\x18\x02 \x01(\x03\x12\x16\n\x0elast_model_dir\x18\x03 \x01(\tB\x02H\x03\x62\x06proto3'
'\n\x10\x63heckpoint.proto\x12\x1epaddle_hub_finetune_checkpoint\"R\n\nCheckPoint\x12\x15\n\rcurrent_epoch\x18\x01 \x01(\x03\x12\x13\n\x0bglobal_step\x18\x02 \x01(\x03\x12\x18\n\x10latest_model_dir\x18\x03 \x01(\tB\x02H\x03\x62\x06proto3'
))
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
_CHECKPOINT = _descriptor.Descriptor(
name='CheckPoint',
......@@ -29,8 +28,8 @@ _CHECKPOINT = _descriptor.Descriptor(
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='last_epoch',
full_name='paddle_hub_finetune_checkpoint.CheckPoint.last_epoch',
name='current_epoch',
full_name='paddle_hub_finetune_checkpoint.CheckPoint.current_epoch',
index=0,
number=1,
type=3,
......@@ -43,10 +42,11 @@ _CHECKPOINT = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='last_step',
full_name='paddle_hub_finetune_checkpoint.CheckPoint.last_step',
name='global_step',
full_name='paddle_hub_finetune_checkpoint.CheckPoint.global_step',
index=1,
number=2,
type=3,
......@@ -59,10 +59,12 @@ _CHECKPOINT = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='last_model_dir',
full_name='paddle_hub_finetune_checkpoint.CheckPoint.last_model_dir',
name='latest_model_dir',
full_name=
'paddle_hub_finetune_checkpoint.CheckPoint.latest_model_dir',
index=2,
number=3,
type=9,
......@@ -75,21 +77,23 @@ _CHECKPOINT = _descriptor.Descriptor(
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
options=None,
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=52,
serialized_end=127,
serialized_end=134,
)
DESCRIPTOR.message_types_by_name['CheckPoint'] = _CHECKPOINT
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
CheckPoint = _reflection.GeneratedProtocolMessageType(
'CheckPoint',
......@@ -101,7 +105,5 @@ CheckPoint = _reflection.GeneratedProtocolMessageType(
))
_sym_db.RegisterMessage(CheckPoint)
DESCRIPTOR.has_options = True
DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(),
_b('H\003'))
DESCRIPTOR._options = None
# @@protoc_insertion_point(module_scope)
......@@ -14,24 +14,93 @@
import collections
FinetuneConfig = collections.namedtuple(
'FinetuneConfig',
[
'log_interval', # print training log every n step
'eval_interval', # evalution the model every n steps
'save_ckpt_interval', # save the model checkpoint every n steps
'use_cuda', # use gpu or not
'learning_rate',
'checkpoint_dir', # model checkpoint directory
'num_epoch', # number of finetune epoch
'batch_size',
# for bert parameter
'max_seq_len', # for bert
'weight_decay', # for bert
'warmup_proportion', # for bert
'in_tokens', # for bert
'finetune_strategy',
'with_memory_optimization',
# learning rate scheduler
'optimizer'
])
class FinetuneConfig(object):
""" This class specifies the configurations for PaddleHub to finetune """
def __init__(self,
log_interval=10,
eval_interval=100,
save_ckpt_interval=None,
use_cuda=False,
learning_rate=1e-4,
checkpoint_dir=None,
num_epoch=10,
batch_size=None,
max_seq_len=128,
weight_decay=None,
warmup_proportion=0.0,
finetune_strategy=None,
enable_memory_optim=True,
optimizer="adam"):
""" Construct finetune Config """
self._log_interval = log_interval
self._eval_interval = eval_interval
self._save_ckpt_interval = save_ckpt_interval
self._use_cuda = use_cuda
self._learning_rate = learning_rate
self._checkpoint_dir = checkpoint_dir
self._num_epoch = num_epoch
self._batch_size = batch_size
self._max_seq_len = max_seq_len
self._weight_decay = weight_decay
self._warmup_proportion = warmup_proportion
self._finetune_strategy = finetune_strategy
self._enable_memory_optim = enable_memory_optim
self._optimizer = optimizer
@property
def log_interval(self):
return self._log_interval
@property
def eval_interval(self):
return self._eval_interval
@property
def save_ckpt_interval(self):
return self._save_ckpt_interval
@property
def use_cuda(self):
return self._use_cuda
@property
def learning_rate(self):
return self._learning_rate
@property
def checkpoint_dir(self):
return self._checkpoint_dir
@property
def num_epoch(self):
return self._num_epoch
@property
def batch_size(self):
return self._batch_size
@property
def max_seq_len(self):
return self._max_seq_len
@property
def weight_decay(self):
return self._weight_decay
@property
def warmup_proportion(self):
return self._warmup_proportion
@property
def finetune_strategy(self):
return self._finetune_strategy
@property
def enable_memory_optim(self):
return self._enable_memory_optim
@property
def optimizer(self):
return self._optimizer
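With the namedtuple replaced by a class, construction takes keyword arguments with defaults, and fields are read back through read-only properties. A usage sketch (the module path `paddle_hub.finetune.config` is assumed):

```python
from paddle_hub.finetune.config import FinetuneConfig  # assumed module path

config = FinetuneConfig(
    use_cuda=False,
    num_epoch=2,
    batch_size=32,
    checkpoint_dir="./ckpt",
    enable_memory_optim=True)

print(config.num_epoch)      # 2, read through the property
print(config.learning_rate)  # 1e-4, the class default
```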
......@@ -27,8 +27,6 @@ from paddle_hub.common.logger import logger
from paddle_hub.finetune.optimization import bert_finetune
from paddle_hub.finetune.checkpoint import load_checkpoint, save_checkpoint
CKPT_FILE = "ckpt.meta"
def _get_running_device_info(config):
if config.use_cuda:
......@@ -41,6 +39,27 @@ def _get_running_device_info(config):
return place, dev_count
def _do_memory_optimization(task, config):
if config.enable_memory_optim:
logger.info("Memory optimization start...")
task_var_name = task.metric_variable_names()
logger.info(
"Skip memory optimization on variables: {}".format(task_var_name))
optimize_time_begin = time.time()
fluid.memory_optimize(
input_program=fluid.default_main_program(),
# skip memory optimization on task metric variables
skip_opt_set=task_var_name)
time_used = time.time() - optimize_time_begin
logger.info("Memory optimization done! Time elapsed %f sec" % time_used)
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=fluid.default_main_program(), batch_size=config.batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit)),
def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
main_program = task.main_program()
startup_program = task.startup_program()
......@@ -50,14 +69,11 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
num_epoch = config.num_epoch
batch_size = config.batch_size
learning_rate = config.learning_rate
with_memory_optimization = config.with_memory_optimization
checkpoint_path = os.path.join(config.checkpoint_dir, CKPT_FILE)
log_writter = LogWriter(
os.path.join(config.checkpoint_dir, "vdllog"), sync_cycle=10)
place, dev_count = _get_running_device_info(config)
with fluid.program_guard(main_program, startup_program):
exe = fluid.Executor(place=place)
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
......@@ -69,33 +85,10 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
optimizer.minimize(loss)
#TODO: add more finetune strategy
if with_memory_optimization:
logger.info("Memory optimization start...")
optimize_time_begin = time.time()
fluid.memory_optimize(
input_program=fluid.default_main_program(),
skip_opt_set=[
# skip task graph variable memory optimization
loss.name,
accuracy.name
])
time_used = time.time() - optimize_time_begin
logger.info(
"Memory optimization done! Time elapsed %f sec" % time_used)
lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
program=main_program, batch_size=batch_size)
logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
(lower_mem, upper_mem, unit)),
# initilize
if os.path.exists(checkpoint_path):
last_epoch, global_step, last_model_dir = load_checkpoint(
checkpoint_path)
fluid.io.load_persistables(exe, last_model_dir)
else:
exe.run(fluid.default_startup_program())
global_step = 0
last_epoch = 1
_do_memory_optimization(task, config)
# Try to restore model training checkpoint
current_epoch, global_step = load_checkpoint(config.checkpoint_dir, exe)
best_eval_acc = 0.0
train_time_used = 0
......@@ -109,7 +102,7 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
eval_loss_scalar = logw.scalar(tag="loss[evaluate]")
eval_acc_scalar = logw.scalar(tag="accuracy[evaluate]")
for epoch in range(last_epoch, num_epoch + 1):
for epoch in range(current_epoch, num_epoch + 1):
train_reader = data_reader.data_generator(
batch_size=batch_size, phase='train')
num_trained_examples = acc_sum = loss_sum = 0
......@@ -141,16 +134,16 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
num_trained_examples = acc_sum = loss_sum = 0
if global_step % config.save_ckpt_interval == 0:
model_saved_dir = os.path.join(
config.checkpoint_dir, "model_in_step_%d" % global_step)
model_saved_dir = os.path.join(config.checkpoint_dir,
"step_%d" % global_step)
fluid.io.save_persistables(exe, dirname=model_saved_dir)
# NOTE: the current checkpoint mechanism is incomplete,
# it can't restore the dataset training status
save_checkpoint(
checkpoint_path,
last_epoch=epoch,
last_step=global_step,
last_model_dir=model_saved_dir)
checkpoint_dir=config.checkpoint_dir,
current_epoch=epoch,
global_step=global_step,
exe=exe)
if do_eval and global_step % config.eval_interval == 0:
eval_loss, eval_acc, eval_perf = evaluate(
......@@ -176,10 +169,10 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
# NOTE: the current checkpoint mechanism is incomplete, it can't
# restore the dataset training status
save_checkpoint(
checkpoint_path,
last_epoch=num_epoch + 1,
last_step=global_step,
last_model_dir=model_saved_dir)
checkpoint_dir=config.checkpoint_dir,
current_epoch=num_epoch + 1,
global_step=global_step,
exe=exe)
if do_eval:
evaluate(task, data_reader, feed_list, phase="test", config=config)
......
......@@ -43,3 +43,10 @@ class Task(object):
def inference_program(self):
return self._inference_program
def metric_variable_names(self):
return list(self.graph_var_dict)
......@@ -464,7 +464,7 @@ class Module(object):
"max_seq_len({}) should be in the range of [1, {}]".format(
MAX_SEQ_LENGTH))
logger.info(
"update maximum sequence length of input tensor to {}".format(
"Set maximum sequence length of input tensor to {}".format(
max_seq_len))
for tensor_name in [
"input_ids", "position_ids", "segment_ids", "input_mask"
......