Multitask分类任务训练loss为nan
Created by: yuanyc06
目前我这边遇到了一个loss变为nan的问题,具体背景是这样:我打算将一个现有简单分类网络变为multitask,即将最后的单个fc层变为2个分支(之前的主干不变,只是最后接出两个并列的fc层),每个fc对应一个独立的分类任务。目前发现训练不久后loss变为极大,之后变为nan。试过了减小lr和gradient clipping,都不管用。试了用caffe代码实现同样的网络,训练时loss正常;也试了单独各自训练两个task(每次只有对应的1个fc),loss也都正常。感觉是我paddle实现的代码哪里有问题,train.py里几个主要函数net_config, build_program, train相关代码如下,对应args.multitask=True的分支:
def net_config(image, label, model, args):
    """Attach loss and accuracy metrics to the model's forward graph.

    Args:
        image: input image variable produced by the reader.
        label: int64 label variable; for the multitask branch this is a
            *list* of two label variables, one per task head.
        model: model object exposing ``net(...)`` and ``params``.
        args: parsed command-line arguments.

    Returns:
        ``(avg_cost, acc_top1, acc_top5)`` for the single-task branches, or
        ``(avg_cost, acc_top1_0, acc_top5_0, acc_top1_1, acc_top5_1)`` for
        the multitask branch.
    """
    model_list = [m for m in dir(models) if "__" not in m]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    class_dim = args.class_dim
    # Per-task class counts for the multitask heads, passed as a JSON
    # list on the command line, e.g. "[10, 20]".
    class_dim_mt = json.loads(args.class_dim_mt)
    model_name = args.model

    if args.enable_ce:
        # Continuous-evaluation runs are pinned to one model with fixed
        # dropout seed and class count for reproducibility.
        assert model_name == "SE_ResNeXt50_32x4d"
        model.params["dropout_seed"] = 100
        class_dim = 102

    if model_name == "GoogleNet":
        # GoogLeNet emits two auxiliary classifier outputs, each weighted
        # 0.3 in the total loss (per the original paper).
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)
        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        # Top-5 accuracy is disabled (class count may be < 5); top-1 is
        # reused as a placeholder so downstream fetch lists keep working.
        acc_top5 = acc_top1
    elif model_name == 'SE_ResNeXt50_32x4d' and args.multitask:
        # Multitask: `label` is a list [label0, label1], one per fc head.
        out0, out1 = model.net(input=image, class_dim=class_dim_mt,
                               multitask=True)
        cost0, pred0 = fluid.layers.softmax_with_cross_entropy(
            out0, label[0], return_softmax=True, ignore_index=-1)
        cost1, pred1 = fluid.layers.softmax_with_cross_entropy(
            out1, label[1], return_softmax=True, ignore_index=-1)
        if args.gradient_clipping:
            gc = float(args.gradient_clipping_th)
            # NOTE(review): fluid.layers.clip clips the forward LOSS
            # VALUES, not the parameter gradients — this is NOT gradient
            # clipping, and samples whose loss leaves [-gc, gc] get zero
            # gradient through the clip op. For real gradient clipping
            # use fluid.clip.set_gradient_clip / GradientClipByGlobalNorm
            # before optimizer.minimize — worth checking as a cause of
            # the diverging/NaN loss described above.
            cost0 = fluid.layers.clip(cost0, -gc, gc)
            cost1 = fluid.layers.clip(cost1, -gc, gc)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        # Equal-weight sum of the two task losses.
        avg_cost = avg_cost0 + avg_cost1
        if args.scale_loss > 1:
            # Loss scaling (normally paired with fp16 master weights).
            avg_cost = avg_cost * float(args.scale_loss)
        # NOTE(review): labels equal to ignore_index (-1) are excluded from
        # the loss but still enter the accuracy computation — confirm that
        # is intended.
        acc_top1_0 = fluid.layers.accuracy(input=pred0, label=label[0], k=1)
        acc_top5_0 = acc_top1_0  # top-5 disabled; placeholder (see above)
        acc_top1_1 = fluid.layers.accuracy(input=pred1, label=label[1], k=1)
        acc_top5_1 = acc_top1_1  # top-5 disabled; placeholder (see above)
        return avg_cost, acc_top1_0, acc_top5_0, acc_top1_1, acc_top5_1
    else:
        print('label:', label)
        out = model.net(input=image, class_dim=class_dim)
        cost, pred = fluid.layers.softmax_with_cross_entropy(
            out, label, return_softmax=True)
        if args.scale_loss > 1:
            avg_cost = fluid.layers.mean(x=cost) * float(args.scale_loss)
        else:
            avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
        acc_top5 = acc_top1  # top-5 disabled; placeholder (see above)
    return avg_cost, acc_top1, acc_top5
def build_program(is_train, main_prog, startup_prog, args):
    """Build a train or test program around a py_reader input pipeline.

    Args:
        is_train: when True, an optimizer is attached to the program.
        main_prog: the fluid.Program to build ops into.
        startup_prog: the fluid.Program receiving parameter initializers.
        args: parsed command-line arguments.

    Returns:
        ``(py_reader, avg_cost, acc_top1, acc_top5)`` for single-task, or
        ``(py_reader, avg_cost, acc_top1_0, acc_top5_0, acc_top1_1,
        acc_top5_1)`` for multitask.
    """
    image_shape = [int(m) for m in args.image_shape.split(",")]
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    model = models.__dict__[model_name]()

    with fluid.program_guard(main_prog, startup_prog):
        # The multitask pipeline feeds one extra int64 label per batch;
        # otherwise the reader configuration is identical.
        if not args.multitask:
            feed_shapes = [[-1] + image_shape, [-1, 1]]
            feed_dtypes = ["float32", "int64"]
        else:
            feed_shapes = [[-1] + image_shape, [-1, 1], [-1, 1]]
            feed_dtypes = ["float32", "int64", "int64"]
        py_reader = fluid.layers.py_reader(
            capacity=16,
            shapes=feed_shapes,
            lod_levels=[0] * len(feed_shapes),
            dtypes=feed_dtypes,
            use_double_buffer=True)

        with fluid.unique_name.guard():
            if not args.multitask:
                image, label = fluid.layers.read_file(py_reader)
            else:
                image, label0, label1 = fluid.layers.read_file(py_reader)
                # net_config expects the two labels packed into a list.
                label = [label0, label1]
            if args.fp16:
                image = fluid.layers.cast(image, "float16")

            if not args.multitask:
                avg_cost, acc_top1, acc_top5 = net_config(image, label,
                                                          model, args)
                # Mark fetch targets persistable so memory optimization
                # does not reuse their buffers.
                avg_cost.persistable = True
                acc_top1.persistable = True
                acc_top5.persistable = True
            else:
                (avg_cost, acc_top1_0, acc_top5_0,
                 acc_top1_1, acc_top5_1) = net_config(image, label, model,
                                                      args)
                avg_cost.persistable = True
                acc_top1_0.persistable = True
                acc_top5_0.persistable = True
                acc_top1_1.persistable = True
                acc_top5_1.persistable = True

            if is_train:
                params = model.params
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                params["learning_strategy"]["batch_size"] = args.batch_size
                params["learning_strategy"]["name"] = args.lr_strategy
                optimizer = optimizer_setting(params)
                if args.fp16:
                    # fp16 path: keep fp32 master copies of the params,
                    # apply (loss-scaled) gradients to them, then copy
                    # back into the fp16 training params.
                    params_grads = optimizer.backward(avg_cost)
                    master_params_grads = create_master_params_grads(
                        params_grads, main_prog, startup_prog,
                        args.scale_loss)
                    optimizer.apply_gradients(master_params_grads)
                    master_param_to_train_param(master_params_grads,
                                                params_grads, main_prog)
                else:
                    optimizer.minimize(avg_cost)

    if not args.multitask:
        return py_reader, avg_cost, acc_top1, acc_top5
    else:
        return (py_reader, avg_cost, acc_top1_0, acc_top5_0, acc_top1_1,
                acc_top5_1)
def train(args):
    """Run the full training loop, evaluating and checkpointing per epoch.

    Builds train/test programs (single-task or multitask depending on
    ``args.multitask``), restores a checkpoint or pretrained weights if
    given, then for each epoch: trains with a ParallelExecutor, evaluates
    on the test reader, logs metrics, and saves persistables.

    Args:
        args: parsed command-line arguments.
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # Fixed seeds for continuous-evaluation reproducibility.
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    print('start build_program')
    if not args.multitask:
        print('single task')
        train_py_reader, train_cost, train_acc1, train_acc5 = build_program(
            is_train=True,
            main_prog=train_prog,
            startup_prog=startup_prog,
            args=args)
        test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
            is_train=False,
            main_prog=test_prog,
            startup_prog=startup_prog,
            args=args)
    else:
        print('multi task')
        (train_py_reader, train_cost, train_acc1_0, train_acc5_0,
         train_acc1_1, train_acc5_1) = build_program(
            is_train=True,
            main_prog=train_prog,
            startup_prog=startup_prog,
            args=args)
        (test_py_reader, test_cost, test_acc1_0, test_acc5_0,
         test_acc1_1, test_acc5_1) = build_program(
            is_train=False,
            main_prog=test_prog,
            startup_prog=startup_prog,
            args=args)
    print('finished build_program')

    test_prog = test_prog.clone(for_test=True)
    if with_memory_optimization:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        # Resume: restore every persistable variable (params + optimizer
        # state).
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        # Warm start: load only the variables that exist on disk, so a
        # backbone checkpoint works even when the heads differ.
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog,
            predicate=if_exist)

    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
    if visible_device:
        device_num = len(visible_device.split(','))
    else:
        # Fall back to counting GPUs via nvidia-smi (one per output line).
        device_num = subprocess.check_output(
            ['nvidia-smi', '-L']).decode().count('\n')

    # BUG FIX: use floor division — under Python 3 `/` yields a float,
    # which is not a valid batch_size for paddle.batch.
    train_batch_size = args.batch_size // device_num
    test_batch_size = 32

    if not args.enable_ce:
        train_reader = paddle.batch(
            reader.train(file_list=args.train_list, data_dir=args.data_dir),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            reader.val(file_list=args.test_list, data_dir=args.data_dir),
            batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid
        # disorder data but it is time consuming. For faster speed, need
        # another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=bool(args.use_gpu),
        loss_name=train_cost.name)

    print('start training')
    if not args.multitask:
        print('single training')
        train_fetch_list = [train_cost.name, train_acc1.name,
                            train_acc5.name]
        test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
        params = models.__dict__[args.model]().params
        for pass_id in range(params["num_epochs"]):
            train_py_reader.start()
            train_info = [[], [], []]
            test_info = [[], [], []]
            train_time = []
            batch_id = 0
            try:
                # py_reader signals end-of-epoch by raising EOFException.
                while True:
                    t1 = time.time()
                    loss, acc1, acc5 = train_exe.run(
                        fetch_list=train_fetch_list)
                    t2 = time.time()
                    period = t2 - t1
                    # Fetches are per-device arrays; average across devices.
                    loss = np.mean(np.array(loss))
                    acc1 = np.mean(np.array(acc1))
                    acc5 = np.mean(np.array(acc5))
                    train_info[0].append(loss)
                    train_info[1].append(acc1)
                    train_info[2].append(acc5)
                    train_time.append(period)
                    if batch_id % 100 == 0:
                        print("Pass {0}, trainbatch {1}, loss {2}, "
                              "acc1 {3}, acc5 {4} time {5}"
                              .format(pass_id, batch_id, loss, acc1, acc5,
                                      "%2.2f sec" % period))
                        sys.stdout.flush()
                    batch_id += 1
            except fluid.core.EOFException:
                train_py_reader.reset()

            train_loss = np.array(train_info[0]).mean()
            train_acc1 = np.array(train_info[1]).mean()
            train_acc5 = np.array(train_info[2]).mean()
            train_speed = np.array(train_time).mean() / (train_batch_size *
                                                         device_num)

            test_py_reader.start()
            test_batch_id = 0
            try:
                while True:
                    t1 = time.time()
                    loss, acc1, acc5 = exe.run(program=test_prog,
                                               fetch_list=test_fetch_list)
                    t2 = time.time()
                    period = t2 - t1
                    loss = np.mean(loss)
                    acc1 = np.mean(acc1)
                    acc5 = np.mean(acc5)
                    test_info[0].append(loss)
                    test_info[1].append(acc1)
                    test_info[2].append(acc5)
                    if test_batch_id % 100 == 0:
                        print("Pass {0},testbatch {1},loss {2}, "
                              "acc1 {3},acc5 {4},time {5}"
                              .format(pass_id, test_batch_id, loss, acc1,
                                      acc5, "%2.2f sec" % period))
                        sys.stdout.flush()
                    test_batch_id += 1
            except fluid.core.EOFException:
                test_py_reader.reset()

            test_loss = np.array(test_info[0]).mean()
            test_acc1 = np.array(test_info[1]).mean()
            test_acc5 = np.array(test_info[2]).mean()

            print("End pass {0}, train_loss {1}, train_acc1 {2}, "
                  "train_acc5 {3}, test_loss {4}, test_acc1 {5}, "
                  "test_acc5 {6}".format(
                      pass_id, train_loss, train_acc1, train_acc5,
                      test_loss, test_acc1, test_acc5))
            sys.stdout.flush()

            # One checkpoint directory per epoch: <save_dir>/<model>/<pass>.
            model_path = os.path.join(model_save_dir, model_name,
                                      str(pass_id))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(exe, model_path,
                                       main_program=train_prog)

            # This is for continuous evaluation only
            if args.enable_ce and pass_id == args.num_epochs - 1:
                if device_num == 1:
                    # Use the mean cost/acc for training
                    print("kpis train_cost %s" % train_loss)
                    print("kpis train_acc_top1 %s" % train_acc1)
                    print("kpis train_acc_top5 %s" % train_acc5)
                    # Use the mean cost/acc for testing
                    print("kpis test_cost %s" % test_loss)
                    print("kpis test_acc_top1 %s" % test_acc1)
                    print("kpis test_acc_top5 %s" % test_acc5)
                    print("kpis train_speed %s" % train_speed)
                else:
                    # Use the mean cost/acc for training
                    print("kpis train_cost_card%s %s" %
                          (device_num, train_loss))
                    print("kpis train_acc_top1_card%s %s" %
                          (device_num, train_acc1))
                    print("kpis train_acc_top5_card%s %s" %
                          (device_num, train_acc5))
                    # Use the mean cost/acc for testing
                    print("kpis test_cost_card%s %s" %
                          (device_num, test_loss))
                    print("kpis test_acc_top1_card%s %s" %
                          (device_num, test_acc1))
                    print("kpis test_acc_top5_card%s %s" %
                          (device_num, test_acc5))
                    print("kpis train_speed_card%s %s" %
                          (device_num, train_speed))
    else:
        print('multi training')
        train_fetch_list = [train_cost.name, train_acc1_0.name,
                            train_acc5_0.name, train_acc1_1.name,
                            train_acc5_1.name]
        test_fetch_list = [test_cost.name, test_acc1_0.name,
                           test_acc5_0.name, test_acc1_1.name,
                           test_acc5_1.name]
        params = models.__dict__[args.model]().params
        for pass_id in range(params["num_epochs"]):
            train_py_reader.start()
            train_info = [[], [], [], [], []]
            test_info = [[], [], [], [], []]
            train_time = []
            batch_id = 0
            try:
                while True:
                    t1 = time.time()
                    loss, acc1_0, acc5_0, acc1_1, acc5_1 = train_exe.run(
                        fetch_list=train_fetch_list)
                    t2 = time.time()
                    period = t2 - t1
                    loss = np.mean(np.array(loss))
                    acc1_0 = np.mean(np.array(acc1_0))
                    acc5_0 = np.mean(np.array(acc5_0))
                    acc1_1 = np.mean(np.array(acc1_1))
                    acc5_1 = np.mean(np.array(acc5_1))
                    train_info[0].append(loss)
                    train_info[1].append(acc1_0)
                    train_info[2].append(acc5_0)
                    train_info[3].append(acc1_1)
                    train_info[4].append(acc5_1)
                    train_time.append(period)
                    if batch_id % 10 == 0:
                        # Top-5 metrics are placeholders (== top-1), so
                        # only top-1 per task is logged.
                        print("Pass {0}, trainbatch {1}, loss {2}, "
                              "acc1_0 {3}, acc1_1 {4}, time {5}"
                              .format(pass_id, batch_id, loss, acc1_0,
                                      acc1_1, "%2.2f sec" % period))
                        sys.stdout.flush()
                    batch_id += 1
            except fluid.core.EOFException:
                train_py_reader.reset()

            train_loss = np.array(train_info[0]).mean()
            train_acc1_0 = np.array(train_info[1]).mean()
            train_acc5_0 = np.array(train_info[2]).mean()
            train_acc1_1 = np.array(train_info[3]).mean()
            train_acc5_1 = np.array(train_info[4]).mean()
            train_speed = np.array(train_time).mean() / (train_batch_size *
                                                         device_num)

            test_py_reader.start()
            test_batch_id = 0
            try:
                while True:
                    t1 = time.time()
                    loss, acc1_0, acc5_0, acc1_1, acc5_1 = exe.run(
                        program=test_prog, fetch_list=test_fetch_list)
                    t2 = time.time()
                    period = t2 - t1
                    loss = np.mean(loss)
                    acc1_0 = np.mean(acc1_0)
                    acc5_0 = np.mean(acc5_0)
                    acc1_1 = np.mean(acc1_1)
                    acc5_1 = np.mean(acc5_1)
                    test_info[0].append(loss)
                    test_info[1].append(acc1_0)
                    test_info[2].append(acc5_0)
                    test_info[3].append(acc1_1)
                    test_info[4].append(acc5_1)
                    if test_batch_id % 10 == 0:
                        print("Pass {0},testbatch {1},loss {2}, "
                              "acc1_0 {3}, acc1_1 {4}, time {5}"
                              .format(pass_id, test_batch_id, loss, acc1_0,
                                      acc1_1, "%2.2f sec" % period))
                        sys.stdout.flush()
                    test_batch_id += 1
            except fluid.core.EOFException:
                test_py_reader.reset()

            test_loss = np.array(test_info[0]).mean()
            test_acc1_0 = np.array(test_info[1]).mean()
            test_acc5_0 = np.array(test_info[2]).mean()
            test_acc1_1 = np.array(test_info[3]).mean()
            test_acc5_1 = np.array(test_info[4]).mean()

            print("End pass {0}, train_loss {1}, train_acc1_0 {2}, "
                  "train_acc1_1 {3}, "
                  "test_loss {4}, test_acc1_0 {5}, test_acc1_1 {6}".format(
                      pass_id, train_loss, train_acc1_0, train_acc1_1,
                      test_loss, test_acc1_0, test_acc1_1))
            sys.stdout.flush()

            model_path = os.path.join(model_save_dir, model_name,
                                      str(pass_id))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(exe, model_path,
                                       main_program=train_prog)