Unverified · Commit 2fdbc1ce, authored by Yancey, committed by GitHub

hidden bcast_params call in dist train (#11575)

Parent: b94f7848
...
@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                     break
             else:
                 loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
-            if args.update_method == "pserver":
-                exe.bcast_params()
             if args.use_reader_op:
                 num_samples += args.batch_size * args.gpus
             else:
...
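For context: with the two deleted lines, the benchmark script no longer decides when to broadcast parameters; ParallelExecutor.run() does it internally (see the parallel_executor.py hunks below). A runnable toy sketch of that contract change, using a hypothetical StubParallelExecutor in place of the real fluid class:

# Hypothetical stub standing in for fluid.ParallelExecutor, to show the
# contract change in this commit: run() now broadcasts internally when
# the program is distributed, so callers drop the manual
# `if args.update_method == "pserver": exe.bcast_params()` branch.
class StubParallelExecutor(object):
    def __init__(self, is_dist):
        self.is_dist = is_dist

    def bcast_params(self):
        print("broadcasting parameters to all devices")

    def run(self, fetch_list, feed=None):
        loss = [0.0]  # pretend we ran one mini-batch
        if self.is_dist:  # the behavior this commit moves inside run()
            self.bcast_params()
        return loss

exe = StubParallelExecutor(is_dist=True)
loss, = exe.run(["avg_loss"])  # no explicit bcast_params() call needed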
...
@@ -71,7 +71,6 @@ class ParallelExecutor(object):
                  num_trainers=1,
                  trainer_id=0,
                  **kwargs):
         if len(kwargs) != 0:
             err_msg = ""
             for key in kwargs:
...
@@ -130,6 +129,11 @@ class ParallelExecutor(object):
         main = main_program
         main = main if main else framework.default_main_program()
         scope = executor.global_scope()
+        # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
+        # train program, call self.bcast_param() at the end of each mini-batch.
+        self.is_dist = True if "recv" in [
+            op.type for op in main.global_block().ops
+        ] else False
         if share_vars_from and not isinstance(share_vars_from,
                                               ParallelExecutor):
...
@@ -262,6 +266,10 @@ class ParallelExecutor(object):
         fetch_var_name = '@FETCHED_VAR_NAME@'
         self.executor.run(fetch_list, fetch_var_name)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+        if self.is_dist:
+            self.bcast_params()
         return [arr[i] for i in range(len(arr))]

     def bcast_params(self):
...
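The added is_dist check implements the heuristic the FIXME describes: a trainer program produced by the distribute transpiler contains "recv" ops (it receives updated parameters from parameter servers), so the presence of such an op is used as a proxy for "this is a distributed train program". A self-contained sketch of that idiom, with hypothetical FakeOp/FakeBlock stand-ins for fluid's Operator and Block types:

# Self-contained sketch of the "recv"-op detection heuristic above.
# FakeOp and FakeBlock are hypothetical stand-ins for fluid's Operator
# and Block; in the real code the check walks main.global_block().ops
# on a fluid Program.
class FakeOp(object):
    def __init__(self, op_type):
        self.type = op_type

class FakeBlock(object):
    def __init__(self, ops):
        self.ops = ops

def is_dist_program(block):
    # Same result as `True if "recv" in [op.type for op in block.ops]
    # else False` in the diff, but any() stops at the first match
    # instead of building the full list.
    return any(op.type == "recv" for op in block.ops)

trainer = FakeBlock([FakeOp("recv"), FakeOp("mul"), FakeOp("send")])
local = FakeBlock([FakeOp("mul"), FakeOp("elementwise_add")])
assert is_dist_program(trainer)
assert not is_dist_program(local)

As the FIXME itself notes, op sniffing is a temporary measure: it couples ParallelExecutor to the transpiler's output format, and a later refactor could pass an explicit distributed-training flag instead.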