MPI分布式训练test_program初始化问题
Created by: shuoyin
Paddle版本:1.5 训练环境:MPI集群 采用pserver-trainer模式训练模型,其中包含一个分布式embedding词表,program创建代码为
# Build the training program: graph input wrapper + GraphSAGE model.
# unique_name.guard() keeps parameter names reproducible across builds.
with fluid.program_guard(train_program, startup_program):
    with fluid.unique_name.guard():
        graph_wrapper = pgl.graph_wrapper.GraphWrapper(
            "sub_graph", place, node_feat=data['graph'].node_feat_info())
        model_loss, recall, precision, all_acc = build_graph_model(
            graph_wrapper,
            hidden_size=args.hidden_size,
            graphsage_type=args.graphsage_type,
            k_hop=len(samples),
            num_node=data['graph'].num_nodes)

# Clone the inference program BEFORE attaching the optimizer, so the
# test program contains only the forward pass (for_test=True also prunes
# train-only ops such as dropout).
test_program = train_program.clone(for_test=True)

# Attach the optimizer to the training program only.
with fluid.program_guard(train_program, startup_program):
    with fluid.unique_name.guard():
        optimizer = fluid.optimizer.SGD(learning_rate=args.lr)
        optimizer.minimize(model_loss)
训练代码如下
for epoch in range(args.epoch):
    batch = 0
    start = time.time()        # wall-clock anchor for the 100-batch average
    start_batch = time.time()  # anchor for per-batch data-read time
    batch_time = 0
    for batch_feed_dict in train_iter():
        end_batch = time.time()
        batch_time += (end_batch - start_batch)
        batch += 1
        if batch % 100 == 0:  # print train log
            # Fetch metrics only every 100 batches to keep training fast.
            outs = exe.run(train_program,
                           feed=batch_feed_dict,
                           fetch_list=fetch_list)
            end = time.time()
            log.info('epoch: %d, batch: %d, loss: %f, recall: %f, precision: %f, all_acc: %f, avg time: %f, avg read time: %f' % \
                (epoch, batch, outs[0], outs[1], outs[2], outs[3],
                 (end - start) / 100.0, batch_time / batch))
            start = time.time()
        else:
            # No fetch on ordinary steps.
            exe.run(train_program, feed=batch_feed_dict)
        if batch % 100 == 0:
            # Periodic validation; checkpoint whenever recall improves.
            recall = run_val(val_iter, exe, test_program, 'eval', fetch=fetch_list)
            if recall > best_recall:
                best_recall = recall
                fluid.io.save_persistables(exe, './checkpoint', train_program)
            start = time.time()  # exclude validation time from the average
        start_batch = time.time()
```
可以正常训练但是在run_val的时候报错paddle.fluid.core_avx.EnforceNotMet: Invoke operator lookup_table error.
以及
C++ Callstacks:
holder_ should not be null
Tensor not initialized yet when Tensor::type() is called. at [/paddle/paddle/fluid/framework/tensor.h:139]
[任务链接](http://10.73.201.14:8910/fileview.html?type=logsdir&path=/&instance=5.app-user-20190911150613-6320--yinshuo01_test_paddle)
谢谢!