From 4d649893ecf95042c18c898264d4d4a200a8a7f1 Mon Sep 17 00:00:00 2001 From: ziyoujiyi <73728031+ziyoujiyi@users.noreply.github.com> Date: Fri, 17 Jun 2022 14:48:57 +0800 Subject: [PATCH] bug fix (#43526) * back fl * delete ssl cert * . * make warning * . * unittest paral degree * solve unittest * heter & multi cloud commm ready * . * . * fl-ps v1.0 * . * support N + N mode * . * . * . * . * delete print * . * . * . * . * fix bug * . * . --- paddle/fluid/framework/heter_pipeline_trainer.cc | 2 +- .../paddle/distributed/passes/ps_trainer_pass.py | 4 ++-- python/paddle/distributed/ps/the_one_ps.py | 16 +++------------- 3 files changed, 6 insertions(+), 16 deletions(-) mode change 100644 => 100755 paddle/fluid/framework/heter_pipeline_trainer.cc diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc old mode 100644 new mode 100755 index dc99885811..98860cfbb0 --- a/paddle/fluid/framework/heter_pipeline_trainer.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer.cc @@ -333,5 +333,5 @@ Scope* HeterPipelineTrainer::GetWorkerScope(int thread_id) { } } // end namespace framework -} // namespace paddle +} // end namespace paddle #endif diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 4a015fea30..80012e7428 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -434,8 +434,8 @@ class DistributedOpsPass(PassBase): if op.type in SPARSE_OP_TYPE_DICT.keys() \ and op.attr('remote_prefetch') is True: param_name = op.input(SPARSE_OP_TYPE_DICT[op.type])[0] - if attrs['is_heter_ps_mode']: - # trick for matchnet, need to modify + if attrs['is_heter_ps_mode'] and not attrs['is_fl_ps_mode']: + # TODO: trick for matchnet, need to modify for heter_ps param_name += op.input("Ids")[0][0] ops = pull_sparse_ops.get(param_name, []) ops.append(op) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 0836e91c30..a199901011 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1015,14 +1015,8 @@ class TheOnePSRuntime(RuntimeBase): is_test = bool(int(os.getenv("TEST_MODE", "0"))) - # for GEO - if self.role_maker._is_first_worker() and self.is_heter_ps_mode: - # for ps-heter mode load all parameters on first_worker - init_params = get_the_one_recv_context(self.context, - split_dense_table=True, - use_origin_program=True) - else: - init_params = dense_map + # for GEO & heter_ps + init_params = dense_map # if not is_test: # self._communicator.init_params(init_params) @@ -1053,11 +1047,7 @@ class TheOnePSRuntime(RuntimeBase): fleet.util.barrier() # 保证 0 号 worker 参数 push_dense_param over if not self.context['use_ps_gpu']: - if self.is_heter_ps_mode == True and not self.role_maker._is_first_worker( - ): - self._communicator.pull_dense(init_params) - else: - self._pull_all_dense(scopes, send_ctx, dense_map) + self._pull_all_dense(scopes, send_ctx, dense_map) fleet.util.barrier() if self.context[ -- GitLab