无法获取accuracy及保存模型报错
Created by: kjfren1985
我根据models里面的ctr例子(https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleRec/ctr/infer.py) 改写了一个脚本,主要在train的同时增加test过程,方便选择模型轮次,整体训练测试过程跑通,不过无法获取accuracy及保存模型报错
- fetch_list不增加accuracy.name可以正常返回,增加accuracy.name后会报错:
Traceback (most recent call last):
File "train.py", line 609, in <module>
train_test()
File "train.py", line 506, in train_test
loss_val, auc_val, batch_auc_val, accuracy_val = pe.run(fetch_list=[loss.name, auc_var.name, batch_auc_var.name, accuracy.name])
File "/home/XXX/paddlepaddle/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/parallel_executor.py", line 276, in run
return executor.as_numpy(arr)
File "/home/XXX/paddlepaddle/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/executor.py", line 83, in as_numpy
return [as_numpy(t) for t in tensor]
File "/home/XXX/paddlepaddle/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/executor.py", line 92, in as_numpy
return LoDTensor itself directly.")
RuntimeError: Some of your fetched tensors hold LoD information. They can not be completely cast to Python ndarray. Please set the parameter 'return_numpy' as 'False' to return LoDTensor itself directly.
保存模型时候报错:
Traceback (most recent call last):
File "train.py", line 610, in <module>
train_test()
File "train.py", line 553, in train_test
fluid.io.save_inference_model(model_dir, data_name_list, [loss, auc_var], pe)
File "/home/kangjianfeng/paddlepaddle/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/io.py", line 681, in save_inference_model
main_program = main_program._prune(targets=target_vars)
File "/home/kangjianfeng/paddlepaddle/paddle_release_home/python/lib/python2.7/site-packages/paddle/fluid/framework.py", line 1741, in _prune
"The target variable must have an "
ValueError: The target variable must have an associated operator that generates it.
代码如下: network_conf.py与https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleRec/ctr/network_conf.py 基本相同,只有最后一行改了:
accuracy = fluid.layers.accuracy(input=predict, label=words[-1])
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=words[-1], num_thresholds=2 ** 12, slide_steps=20)
return accuracy, avg_cost, auc_var, batch_auc_var, py_reader
train.py:
def train_test():
args = parse_args()
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
train_prog = fluid.Program()
train_startup = fluid.Program()
# Define train network
with fluid.program_guard(train_prog, train_startup):
# Use fluid.unique_name.guard() to share parameters with test network
with fluid.unique_name.guard():
accuracy, loss, auc_var, batch_auc_var, py_reader = ctr_dnn_model_new(args.embedding_size, args.sparse_feature_dim, True)
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
optimizer.minimize(loss)
test_prog = fluid.Program()
test_startup = fluid.Program()
# Define test network
with fluid.program_guard(test_prog, test_startup):
# Use fluid.unique_name.guard() to share parameters with train network
with fluid.unique_name.guard():
accuracy_test, loss_test, auc_var_test, batch_auc_var_test, py_reader_test = ctr_dnn_model_new(args.embedding_size, args.sparse_feature_dim, False)
logger.info(args.is_local)
logger.info("run local training")
#main_program = fluid.default_main_program()
#train_loop_add_test(args, main_program, py_reader, loss, auc_var, batch_auc_var, 1, 0)
#train_loop_add_test code mv here
trainer_num = 1
trainer_id = 0
dataset = reader.CriteoDataset(args.sparse_feature_dim)
train_reader = paddle.batch(
paddle.reader.shuffle(
dataset.train([args.train_data_path], trainer_num, trainer_id),
buf_size=args.batch_size * 100),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
dataset.test([args.train_data_path]),
buf_size=args.batch_size * 100),
batch_size=args.batch_size)
py_reader.decorate_paddle_reader(train_reader)
py_reader_test.decorate_paddle_reader(test_reader)
data_name_list = []
place = fluid.CPUPlace()
#place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exec_strategy = fluid.ExecutionStrategy()
build_strategy = fluid.BuildStrategy()
if os.getenv("NUM_THREADS", ""):
exec_strategy.num_threads = int(os.getenv("NUM_THREADS"))
cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
build_strategy.reduce_strategy = \
fluid.BuildStrategy.ReduceStrategy.Reduce if cpu_num > 1 \
else fluid.BuildStrategy.ReduceStrategy.AllReduce
pe = fluid.ParallelExecutor(
use_cuda=False,
loss_name=loss.name,
main_program=train_prog,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
te = fluid.ParallelExecutor(
use_cuda=False,
loss_name=loss_test.name,
main_program=test_prog,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
#exe.run(fluid.default_startup_program())
# Run startup program
exe.run(train_startup)
exe.run(test_startup)
for pass_id in range(args.num_passes):
pass_start = time.time()
batch_id = 0
py_reader.start()
try:
while True:
loss_val, auc_val, batch_auc_val, accuracy_val = pe.run(fetch_list=[loss.name, auc_var.name, batch_auc_var.name, accuracy.name])
#loss_val, auc_val, batch_auc_val = pe.run(fetch_list=[loss.name, auc_var.name, batch_auc_var.name])
loss_val = np.mean(loss_val)
auc_val = np.mean(auc_val)
batch_auc_val = np.mean(batch_auc_val)
logger.info("TRAIN --> pass: {} batch: {} loss: {} auc: {}, batch_auc: {}"
.format(pass_id, batch_id, loss_val/args.batch_size, auc_val, batch_auc_val))
if batch_id % 1000 == 0 and batch_id != 0:
model_dir = args.model_output_dir + '/batch-' + str(batch_id)
if args.trainer_id == 0:
fluid.io.save_inference_model(model_dir, data_name_list, [loss, auc_var], exe)
batch_id += 1
except fluid.core.EOFException:
print('End of epoch', pass_id)
py_reader.reset()
print("pass_id: %d, pass_time_cost: %f" % (pass_id, time.time() - pass_start))
#add test pyreader
py_reader_test.start()
try:
while True:
print('test_%s', loss_test.name)
loss_val, auc_val, batch_auc_val = te.run(fetch_list=[loss_test.name, auc_var_test.name, batch_auc_var_test.name])
loss_val = np.mean(loss_val)
auc_val = np.mean(auc_val)
batch_auc_val = np.mean(batch_auc_val)
print('test...')
logger.info("TEST --> pass: {} loss: {} auc: {}, batch_auc: {}"
.format(pass_id, loss_val/args.batch_size, auc_val, batch_auc_val))
except fluid.core.EOFException:
print('End of testing')
py_reader_test.reset()
print("pass_id: %d, pass_time_cost: %f" % (pass_id, time.time() - pass_start))
#
model_dir = args.model_output_dir + '/pass-' + str(pass_id)
if args.trainer_id == 0:
fluid.io.save_inference_model(model_dir, data_name_list, [loss, auc_var], pe)
环境:
- gpu机器,gpu版本paddle,gpu模式
- a)PaddlePaddle版本:1.2.0(内网一键安装paddlepaddle_gpu-1.2.0.post87-cp27-cp27mu-linux_x86_64.whl)
- b)系统环境:CentOS release 6.3 (Final)