rank_loss为inf
Created by: yuyy0823
-
版本、环境信息: 1)PaddlePaddle版本:1.5.2gpu版 2)GPU:CUDA Version 8.0.44 CUDNN版本号 3)系统环境:CentOS release 6.3 (Final),Python2.7.3
-
训练信息 1)单机多卡 2)NVIDIA-SMI 384.81 Driver Version: 384.81
-
问题描述:训练ranknet模型,训练过程中loss出现inf,即使lr设置为1e-5,也会出现loss为inf,有时候前几个epoch出现inf,后面训练又变正常不再出现inf,有时候前几个epoch出现inf,中间几个epoch正常,没有inf,然后再过几个epoch又出现inf,感觉很奇怪,麻烦相关同学帮忙分析下原因。 使用check_nan_inf检测,日志结果如下: C++ Call stacks: Operator rank_loss output Tensor traintrainrank_loss_0.tmp_0 contains Inf at [/paddle/paddle/fluid/framework/operator.cc:841] PaddlePaddle Call Stacks: 0 0x7f3c481bd4f0p void paddle::platform::EnforceNotMet::Init<char const*>(char const*, char const*, int) + 352 1 0x7f3c481bd869p paddle::platform::EnforceNotMet::EnforceNotMet(std::exception_ptr::exception_ptr, char const*, int) + 137 2 0x7f3c4a24ec6bp 3 0x7f3c4a2534f0p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&, paddle::framework::RuntimeContext*) const + 560 4 0x7f3c4a253811p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, 
boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 529 5 0x7f3c4a24e3a3p paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 307 6 0x7f3c4a04421ap paddle::framework::details::ComputationOpHandle::RunImpl() + 250 7 0x7f3c4a0352e0p paddle::framework::details::OpHandleBase::Run(bool) + 160 8 0x7f3c4a016656p paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*) + 310 9 0x7f3c4a0152bfp paddle::framework::details::FastThreadedSSAGraphExecutor::RunOp(paddle::framework::details::OpHandleBase*, std::shared_ptr<paddle::framework::BlockingQueue > const&, unsigned long*) + 47 10 0x7f3c4a01567fp 11 0x7f3c483f4ad3p std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()(), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result, std::__future_base::_Result_base::_Deleter>, void> >::_M_invoke(std::_Any_data const&) + 35 12 0x7f3c48288b37p std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()()>&, bool&) + 39 13 0x38c040cbe0p pthread_once + 80 14 0x7f3c4a010d02p 15 
0x7f3c4828a0b4p _ZZN10ThreadPoolC1EmENKUlvE_clEv + 404 16 0x38c30b6470p 17 0x38c0407df3p 18 0x38bfcf62cdp clone + 109
网络结构:
def half_ranknet(input):
    """One scoring tower of the siamese RankNet.

    Five ReLU fully-connected layers (256 -> 128 -> 64 -> 64 -> 32,
    named fc1..fc5) followed by a linear single-unit scoring layer
    named 'out'. The trailing scale(x, 1.0) is a no-op kept so the
    returned graph variable matches the original network.
    """
    feat = input
    # Build the hidden stack from a width table instead of repeating
    # one fc(...) line per layer; layer names stay fc1..fc5.
    for idx, width in enumerate((256, 128, 64, 64, 32), 1):
        feat = fluid.layers.fc(
            input=feat, size=width, act='relu', name='fc%d' % idx)
    score = fluid.layers.fc(input=feat, size=1, name='out')
    return fluid.layers.scale(score, 1.0)
主要训练代码
def parse_line(line):
    """Parse one TAB-separated sample line into a ranking pair.

    Expected layout: two leading meta fields, then 100 features for the
    left document, 100 features for the right document, then the label.

    Args:
        line (str): raw line from the data file.

    Returns:
        tuple: (feats1, feats2, label) — two float ndarrays of length
        100 and the label as a Python float.
    """
    tokens = line.rstrip().split("\t")[2:]
    feats1 = tokens[:100]
    # tokens[100:-1] keeps everything between the left features and the
    # trailing label field.
    feats2 = tokens[100:-1]
    # Use list comprehensions instead of bare map(): on Python 3 map()
    # returns a lazy iterator and np.array(map(...)) silently produces a
    # 0-d object array instead of a float vector. Identical result on
    # Python 2.
    left = np.asarray([float(t) for t in feats1], dtype=float)
    right = np.asarray([float(t) for t in feats2], dtype=float)
    return left, right, float(tokens[-1])
def read_data(data_file):
    """Return a generator factory yielding parsed samples from *data_file*.

    Each yielded item is the (feats1, feats2, label) tuple produced by
    parse_line for one line of the file.
    """
    def reader():
        with open(data_file, 'r') as handle:
            for raw_line in handle:
                yield parse_line(raw_line)
    return reader
def train_reader():
    """Reader factory bound to the module-level train_data_file path."""
    # `global` is unnecessary for read-only access to a module variable;
    # behaviour is unchanged without it.
    return read_data(train_data_file)
def test_reader():
    """Reader factory bound to the module-level test_data_file path."""
    # Read-only access needs no `global` declaration; behaviour identical.
    return read_data(test_data_file)
def optimizer_setting(args):
    """Build an Adamax optimizer with a piecewise-decayed learning rate.

    Args:
        args: parsed CLI arguments providing
            - boundaries: comma-separated global-step boundaries,
            - decay: comma-separated LR multipliers (exactly one more
              entry than boundaries, per the piecewise_decay contract),
            - lr: base learning rate.

    Returns:
        fluid.optimizer.AdamaxOptimizer with L2 weight decay 5e-5.

    Raises:
        ValueError: if len(decay) != len(boundaries) + 1, which would
            otherwise fail later inside piecewise_decay with a less
            helpful message.
    """
    # Materialize with list(...) so this also works on Python 3, where
    # map() returns a lazy iterator.
    boundaries = list(map(int, args.boundaries.split(',')))
    decay = list(map(float, args.decay.split(',')))
    if len(decay) != len(boundaries) + 1:
        raise ValueError(
            "piecewise_decay needs len(decay) == len(boundaries) + 1, "
            "got %d and %d" % (len(decay), len(boundaries)))
    lr = args.lr
    values = [lr * factor for factor in decay]
    optimizer = fluid.optimizer.AdamaxOptimizer(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005), )
    return optimizer
def build_program(main_prog, startup_prog, is_train, network, args):
    """Assemble the train or inference graph inside the given programs.

    Builds a py_reader feeding two 100-dim feature tensors plus a label,
    runs both through the shared RankNet tower, and attaches either the
    training loss + optimizer or the inference loss + accuracy.

    Args:
        main_prog / startup_prog: fluid Programs to populate.
        is_train (bool): selects the "train" or "inference" name scope.
        network: unused here — half_ranknet is called directly.
        args: CLI args forwarded to optimizer_setting.

    Returns:
        list: [py_reader, loss, feats1, output_left] when training,
        [py_reader, loss, accuracy] for inference.
    """
    outs = []
    with fluid.program_guard(main_prog, startup_prog):
        # Batched input pipeline: (left feats, right feats, label),
        # each batch-dim -1, all float32.
        py_reader = fluid.layers.py_reader(
            capacity=10000,
            shapes=((-1, 100), (-1, 100), (-1, 1)),
            dtypes=["float32", "float32", "float32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            feats1, feats2, label = fluid.layers.read_file(py_reader)
            # Siamese towers: both sides go through the same network
            # definition (shared layer names within this guard).
            output_left = half_ranknet(feats1)
            output_right = half_ranknet(feats2)
            if is_train:
                with fluid.unique_name.guard("train"):
                    # NOTE(review): rank_loss involves log/exp of the
                    # score difference; very large |left - right| could
                    # plausibly overflow to inf (the symptom reported in
                    # this issue) — confirm and consider clipping scores.
                    loss = fluid.layers.rank_loss(label=label, left=output_left, right=output_right)
                    loss = fluid.layers.mean(loss)
                    optimizer = optimizer_setting(args)
                    optimizer.minimize(loss)
                    outs = [py_reader, loss, feats1, output_left]
            else:
                with fluid.unique_name.guard("inference"):
                    loss = fluid.layers.rank_loss(label=label, left=output_left, right=output_right)
                    loss = fluid.layers.mean(loss)
                    # Softmax over [right, left] scores turns the pair
                    # into a 2-way classification for accuracy.
                    predict = fluid.layers.softmax(fluid.layers.concat(input=[output_right, output_left], axis=1))
                    # assumes labels are 0/1 so the int64 cast yields a
                    # valid class index — TODO confirm against the data.
                    accuracy = fluid.layers.accuracy(input=predict, label=fluid.layers.cast(label, dtype='int64'))
                    outs = [py_reader, loss, accuracy]
    return outs
def train(params_dirname, network, args):
    """Run the full train/eval loop and save the best model.

    Builds separate train/test programs, trains with a (Parallel)Executor,
    evaluates after each epoch, and saves whenever the test loss improves.

    Args:
        params_dirname (str): directory to write the saved model into.
        network: forwarded to build_program (unused there).
        args: CLI args (use_gpu, batch_size, epoch_num, lr, ...).
    """
    model_save_dir = params_dirname
    # pretrained_model = args.pretrained_model
    use_gpu = args.use_gpu
    parallel = True
    #parallel = False
    enable_ce = True
    is_shuffle = True
    if not use_gpu:
        devices_num = int(os.environ.get('CPU_NUM',
            multiprocessing.cpu_count()))
    else:
        devices_num = fluid.core.get_cuda_device_count()
    #devices_num = 2
    batch_size = args.batch_size
    epoc_num = args.epoch_num
    # Effectively "until the reader is exhausted": the inner loop is
    # terminated by EOFException, not by this bound.
    iters_per_epoc = 10000000
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if enable_ce:
        # Fixed seeds for reproducible continuous-evaluation runs;
        # shuffling is also disabled below (flag set but note the batch
        # readers still wrap paddle.reader.shuffle).
        import random
        random.seed(0)
        np.random.seed(0)
        is_shuffle = False
        startup_prog.random_seed = 111
        train_prog.random_seed = 111
        test_prog.random_seed = 111
    train_py_reader, loss, feats, predict = build_program(
        main_prog=train_prog,
        startup_prog=startup_prog,
        is_train=True,
        network=network,
        args=args)
    test_py_reader, test_loss, accuracy = build_program(
        main_prog=test_prog,
        startup_prog=startup_prog,
        is_train=False,
        network=network,
        args=args)
    test_prog = test_prog.clone(for_test=True)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    if parallel:
        # Multi-device execution; single-device exe.run is the fallback
        # in the batch loop below.
        train_exe = fluid.ParallelExecutor(main_program=train_prog,
            use_cuda=use_gpu, loss_name=loss.name)
    train_data_reader = paddle.batch(
        reader=paddle.reader.shuffle(train_reader(), buf_size=1000),
        batch_size=batch_size)
    train_py_reader.decorate_paddle_reader(train_data_reader)
    test_data_reader = paddle.batch(
        reader=paddle.reader.shuffle(test_reader(), buf_size=1000),
        batch_size=10000)
    test_py_reader.decorate_paddle_reader(test_data_reader)
    def save_model(main_prog):
        # Persist both the inference graph and the raw parameters.
        # if os.path.isdir(model_path):
        #     shutil.rmtree(model_path)
        print('save models to %s' % (model_save_dir))
        fluid.io.save_inference_model(model_save_dir, [feats.name], [predict], exe, train_prog, params_filename="__params__")
        fluid.io.save_persistables(exe, model_save_dir, main_program=train_prog)
    # best_loss < 0 acts as the "not yet set" sentinel for the first epoch.
    best_loss = -1.0
    total_time = 0.0
    global_batch = 0
    for epoc_id in range(epoc_num):
        try:
            train_py_reader.start()
            epoch_idx = epoc_id + 1
            start_time = time.time()
            every_epoc_loss = []
            for batch_id in range(iters_per_epoc):
                prev_start_time = start_time
                if parallel:
                    loss_v, = train_exe.run(fetch_list=[loss.name])
                else:
                    loss_v, = exe.run(train_prog, fetch_list=[loss.name])
                # ParallelExecutor may return one value per device;
                # average them into a scalar.
                loss_v = np.mean(np.array(loss_v))
                every_epoc_loss.append(loss_v)
                global_batch += 1
                if global_batch % 1000 == 0:
                    start_time = time.time()
                    print("Epoc {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
                        epoc_id, global_batch, loss_v, start_time - prev_start_time))
            end_time = time.time()
            total_time += end_time - start_time
        # Reader exhaustion ends the epoch (Python 2 print statement kept).
        except (fluid.core.EOFException, StopIteration):
            print 'End of epoch', epoc_id
            train_py_reader.reset()
        # --- per-epoch evaluation over the whole test reader ---
        test_py_reader.start()
        try:
            count = 0
            avg_test_loss = 0.0
            avg_test_acc = 0.0
            while True:
                test_loss_, acc = exe.run(test_prog, fetch_list=[test_loss.name, accuracy])
                avg_test_loss += test_loss_[0]
                avg_test_acc += acc[0]
                count += 1
                if count % 5 == 0:
                    print ("test loss %f, accuracy %f" % (test_loss_[0], acc[0]))
        except fluid.core.EOFException:
            test_py_reader.reset()
        # NOTE(review): if the test reader yields no batches, count stays 0
        # and these divisions raise ZeroDivisionError — confirm data paths.
        avg_test_loss /= count
        avg_test_acc /= count
        if best_loss < 0 or avg_test_loss < best_loss:
            save_model(test_prog)
            best_loss = avg_test_loss
        one_epoch_time = time.time() - start_time
        print("Epoch time {:.5f} test loss={:.6f}, accuracy={:.6f}".format(one_epoch_time, avg_test_loss, avg_test_acc))