From 70029395405733b102463de2792717b53ec69b54 Mon Sep 17 00:00:00 2001
From: ZPaC
Date: Tue, 8 Sep 2020 20:28:14 +0800
Subject: [PATCH] 1. Fix error when pserver finishes training. 2. Optimize
 worker and server logs. 3. Try/catch ps-lite exceptions.

---
 mindspore/ccsrc/frontend/parallel/ps/parameter_server.h    | 4 ++++
 mindspore/ccsrc/frontend/parallel/ps/worker.h              | 6 +++++-
 model_zoo/official/recommend/wide_and_deep/src/callbacks.py | 3 +++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h b/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
index 831c7243e..7290451a1 100644
--- a/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
+++ b/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
@@ -736,7 +736,9 @@ void ParameterServer<T>::SyncEmbeddingTables() {
 
 template <typename T>
 void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) {
+  MS_LOG(INFO) << "PServer starts connecting to scheduler and workers...";
   ::ps::Start(0);
+  MS_LOG(INFO) << "PServer connected successfully.";
   if (!::ps::IsServer()) {
     std::cout << "This is not ther Server" << std::endl;
     return;
@@ -744,7 +746,9 @@ void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) {
   Init(func_graph);
   PSContext::instance()->SetPSRankId(rank_id_);
   thread_->join();
+  MS_LOG(INFO) << "PServer finished updating models, starts finalizing...";
   ::ps::Finalize(0, true);
+  MS_LOG(INFO) << "PServer finalized successfully.";
 }
 }  // namespace ps
 }  // namespace parallel
diff --git a/mindspore/ccsrc/frontend/parallel/ps/worker.h b/mindspore/ccsrc/frontend/parallel/ps/worker.h
index 8a83674da..29b927511 100644
--- a/mindspore/ccsrc/frontend/parallel/ps/worker.h
+++ b/mindspore/ccsrc/frontend/parallel/ps/worker.h
@@ -86,7 +86,9 @@ void Worker<T>::Run() {
     MS_LOG(INFO) << "'Worker is already running.";
     return;
   }
+  MS_LOG(INFO) << "Worker starts connecting to scheduler and server...";
   ::ps::Start(0);
+  MS_LOG(INFO) << "Worker connected successfully.";
   if (!::ps::IsWorker()) {
     MS_LOG(EXCEPTION) << "The role is not worker.";
   }
@@ -176,9 +178,11 @@ void Worker<T>::DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const :
 
 template <typename T>
 void Worker<T>::Finalize() {
   if (running_) {
+    MS_LOG(INFO) << "Worker starts finalizing...";
     kv_worker_->Finalize();
     kv_worker_.reset();
     running_ = false;
+    MS_LOG(INFO) << "Worker finalized successfully.";
   }
 }
@@ -315,7 +319,7 @@ void Worker<T>::InitPSParamAndOptim(const std::string &param_name, tensor::Tenso
 
   size_t param_key = GetParamKey(param_name);
   if (param_key == kInvalidKey) {
-    MS_LOG(INFO) << "Parameter " << param_name << " has no key assigned.";
+    MS_LOG(DEBUG) << "Parameter " << param_name << " has no key assigned.";
     return;
   }
   bool init_in_server = false;
diff --git a/model_zoo/official/recommend/wide_and_deep/src/callbacks.py b/model_zoo/official/recommend/wide_and_deep/src/callbacks.py
index c48ac8a69..2e358a6cd 100644
--- a/model_zoo/official/recommend/wide_and_deep/src/callbacks.py
+++ b/model_zoo/official/recommend/wide_and_deep/src/callbacks.py
@@ -36,6 +36,7 @@ class LossCallBack(Callback):
 
     Note:
         If per_print_times is 0, do NOT print loss.
+        If this process is MS_PSERVER role, do not run callbacks.
 
     Args:
         per_print_times (int): Print loss every times. Default: 1.
@@ -50,6 +51,8 @@ class LossCallBack(Callback):
     def step_end(self, run_context):
         """Monitor the loss in training."""
         cb_params = run_context.original_args()
+        if cb_params.net_outputs is None:
+            return
         wide_loss, deep_loss = cb_params.net_outputs[0].asnumpy(), cb_params.net_outputs[1].asnumpy()
         cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
         cur_num = cb_params.cur_step_num
--
GitLab
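
Note on the callbacks.py hunk: a process launched in the MS_PSERVER role never
executes the training step, so cb_params.net_outputs stays None, and the added
guard returns early instead of calling .asnumpy() on None. Below is a minimal,
self-contained sketch of that guard. The RunContext and CallbackParam stubs and
the plain-float losses are illustrative assumptions, not MindSpore's actual API.

    # Sketch only: stub objects standing in for MindSpore's run_context types.
    class CallbackParam:
        def __init__(self, net_outputs=None):
            self.net_outputs = net_outputs  # stays None on an MS_PSERVER process

    class RunContext:
        def __init__(self, cb_params):
            self._cb_params = cb_params

        def original_args(self):
            return self._cb_params

    class LossCallBack:
        def step_end(self, run_context):
            """Monitor the loss in training."""
            cb_params = run_context.original_args()
            if cb_params.net_outputs is None:
                # Parameter-server role: no loss outputs exist, so skip the
                # callback body entirely (the error this patch guards against).
                return
            wide_loss, deep_loss = cb_params.net_outputs
            print("wide_loss:", wide_loss, "deep_loss:", deep_loss)

    LossCallBack().step_end(RunContext(CallbackParam()))            # no-op
    LossCallBack().step_end(RunContext(CallbackParam((0.5, 1.2))))  # prints losses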