Unverified commit 2fdbc1ce, authored by Yancey, committed by GitHub

Hide bcast_params call in dist train (#11575)

Parent: b94f7848
......@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                 break
         else:
             loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
-        if args.update_method == "pserver":
-            exe.bcast_params()
         if args.use_reader_op:
             num_samples += args.batch_size * args.gpus
         else:
......
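The two removed lines were the only distributed-specific step in this training loop: under the pserver update method, the benchmark had to re-broadcast the freshly received parameters to every GPU after each mini-batch. With this commit the broadcast happens inside ParallelExecutor.run() (see the hunks below), so the pserver branch collapses to the same loop as local multi-GPU training. A minimal sketch of the loop after the change (avg_loss, feeder, and train_reader are the benchmark's own names):

    for data in train_reader():
        loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
        # no explicit exe.bcast_params() any more: run() broadcasts
        # automatically when the program contains a "recv" op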
......@@ -71,7 +71,6 @@ class ParallelExecutor(object):
                  num_trainers=1,
                  trainer_id=0,
                  **kwargs):
-
         if len(kwargs) != 0:
             err_msg = ""
             for key in kwargs:
......@@ -130,6 +129,11 @@ class ParallelExecutor(object):
         main = main_program
         main = main if main else framework.default_main_program()
         scope = executor.global_scope()
+        # FIXME(Yancey1989): it's a temporary approach to determine the distributed
+        # train program; call self.bcast_params() at the end of each mini-batch.
+        self.is_dist = True if "recv" in [
+            op.type for op in main.global_block().ops
+        ] else False
 
         if share_vars_from and not isinstance(share_vars_from,
                                               ParallelExecutor):
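The new is_dist flag keys off the recv operator: when the distribute transpiler rewrites a program for pserver training, the trainer-side program contains recv ops that pull updated parameters back from the parameter servers, so their presence is a usable (if ad hoc, as the FIXME admits) marker for distributed training. As a standalone illustration of the same heuristic (a sketch, not part of the commit; it assumes paddle.fluid and the Program API visible in the hunk above):

    import paddle.fluid as fluid

    def is_distributed_program(program=None):
        # Trainer programs transpiled for pserver mode contain "recv" ops
        # that fetch updated parameters back from the parameter servers.
        program = program if program is not None else fluid.default_main_program()
        return any(op.type == "recv" for op in program.global_block().ops)

A flag set on the program by the transpiler itself would be more robust than scanning op types, which is presumably why the comment is marked FIXME.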
......@@ -262,6 +266,10 @@ class ParallelExecutor(object):
         fetch_var_name = '@FETCHED_VAR_NAME@'
         self.executor.run(fetch_list, fetch_var_name)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if self.is_dist:
+            self.bcast_params()
+
         return [arr[i] for i in range(len(arr))]
 
     def bcast_params(self):
......
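Taken together, run() now fetches its results and then, for distributed programs only, re-broadcasts parameters to all device scopes before returning, so callers never invoke bcast_params() themselves. On the trainer side of a pserver job the flow looks roughly like this (a hedged sketch: the transpiler setup is elided, and trainer_prog, avg_loss, feeder, and train_reader are placeholder names):

    # trainer_prog came from the distribute transpiler and therefore
    # contains "recv" ops, so the constructor sets self.is_dist = True
    exe = fluid.ParallelExecutor(
        use_cuda=True, loss_name=avg_loss.name, main_program=trainer_prog)
    for batch in train_reader():
        loss, = exe.run([avg_loss.name], feed=feeder.feed(batch))
        # parameters received from the pservers have already been
        # re-broadcast to every GPU scope inside run()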