GES Example,训练提前break后程序会hang住无法正常退出
Created by: minhozhou
环境:Python 3、最新版 PGL、Paddle 1.6、GPU 环境。相对于 example 中的 GES 代码,改动如下:在 step == 5 时提前 break,之后程序会卡住,无法正常退出:
def train(train_exe, exe, program, loss, node2vec_pyreader, args, train_steps):
    """Run the training loop, periodically saving parameters.

    Each iteration runs one step on ``train_exe``, logs the mean loss and the
    step time, and (on trainer 0 only) saves parameters every
    ``args.steps_per_save`` steps and at ``train_steps``.

    Args:
        train_exe: Executor whose ``run(fetch_list=[loss])`` performs one
            training step and returns a one-element result with the loss.
        exe: Executor handed to ``F.io.save_params`` when checkpointing.
        program: Program whose parameters are saved.
        loss: Variable fetched on every step.
        node2vec_pyreader: Reader feeding ``train_exe``. It is reset when
            ``F.core.EOFException`` is raised and again when the loop exits.
        args: Object providing ``steps_per_save`` and ``output_path``.
        train_steps: Step count at which training stops.
    """
    # Early cut-off: leave the loop after this many successful steps.
    early_stop_step = 5

    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    step = 0
    try:
        while True:
            try:
                begin_time = time.time()
                loss_val, = train_exe.run(fetch_list=[loss])
                log.info("step %s: loss %.5f speed: %.5f s/step" %
                         (step, np.mean(loss_val), time.time() - begin_time))
                step += 1
                if step == early_stop_step:
                    break
            except F.core.EOFException:
                # Data exhausted: reset the reader before continuing.
                node2vec_pyreader.reset()

            # Only trainer 0 writes checkpoints.
            if (step % args.steps_per_save == 0 or
                    step == train_steps) and trainer_id == 0:
                model_save_dir = args.output_path
                model_path = os.path.join(model_save_dir, str(step))
                if not os.path.exists(model_save_dir):
                    os.makedirs(model_save_dir)
                F.io.save_params(exe, model_path, program)
            if step == train_steps:
                break
    finally:
        # Always shut the reader down on the way out. Leaving the loop early
        # without a reset left the process unable to exit.
        # NOTE(review): presumably the reader's feeding thread stays blocked
        # on its queue until reset() closes it -- confirm against the Paddle
        # py_reader implementation, and confirm callers do not reuse the
        # reader after train() returns.
        node2vec_pyreader.reset()