From 903235945b20b27b1b9f4aa04b7f2e3ab5fa0b43 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Fri, 26 Feb 2021 11:53:03 +0800
Subject: [PATCH] loglevel adjustment for distributed training (#31205)

Change-Id: I6210ce9c60bed48f3323c47b16500302b66cedf2
---
 paddle/fluid/distributed/fleet.cc                     |  2 +-
 .../fluid/distributed/service/brpc_ps_server.cc       |  7 ++-----
 paddle/fluid/distributed/service/communicator.cc      |  2 +-
 paddle/fluid/distributed/service/communicator.h       | 10 +++++-----
 paddle/fluid/distributed/service/heter_client.cc      |  8 ++++----
 paddle/fluid/distributed/service/heter_server.cc      |  2 +-
 paddle/fluid/distributed/service/heter_server.h       |  2 +-
 .../distributed/table/common_dense_table.cc           |  2 +-
 .../fluid/distributed/table/common_dense_table.h      |  7 ++-----
 .../distributed/table/common_sparse_table.cc          | 16 ++++++++--------
 10 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc
index f4fdf4880bc..b638af49730 100644
--- a/paddle/fluid/distributed/fleet.cc
+++ b/paddle/fluid/distributed/fleet.cc
@@ -501,7 +501,7 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
     if (name.find("batch_sum") != std::string::npos) {
       Variable* var = scope->FindVar(name);
       CHECK(var != nullptr) << "var[" << name << "] not found";
-      VLOG(0) << "prepare shrink dense batch_sum";
+      VLOG(3) << "prepare shrink dense batch_sum";
       LoDTensor* tensor = var->GetMutable<LoDTensor>();
       float* g = tensor->data<float>();
diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc
index 32de1184738..8400e669182 100644
--- a/paddle/fluid/distributed/service/brpc_ps_server.cc
+++ b/paddle/fluid/distributed/service/brpc_ps_server.cc
@@ -79,16 +79,13 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) {
     }
   }

-  VLOG(0) << "BrpcPsServer::start registe_ps_server";
   _environment->registe_ps_server(ip, port, _rank);
-  VLOG(0) << "BrpcPsServer::start wait";
   cv_.wait(lock, [&] { return stoped_; });

   PSHost host;
   host.ip = ip;
   host.port = port;
   host.rank = _rank;
-  VLOG(0) << "BrpcPsServer::start return host.rank";
   return host.rank;
 }
@@ -464,7 +461,7 @@ int32_t BrpcPsService::save_one_table(Table *table,

   int32_t feasign_size = 0;

-  VLOG(0) << "save one table " << request.params(0) << " " << request.params(1);
+  VLOG(3) << "save table " << request.params(0) << " " << request.params(1);
   feasign_size = table->save(request.params(0), request.params(1));
   if (feasign_size < 0) {
     set_response_code(response, -1, "table save failed");
@@ -507,7 +504,7 @@ int32_t BrpcPsService::shrink_table(Table *table,
     set_response_code(response, -1, "table shrink failed");
     return -1;
   }
-  VLOG(0) << "Pserver Shrink Finished";
+  VLOG(3) << "Pserver Shrink Finished";
   return 0;
 }
diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc
index aea758a717b..8699719e5cd 100644
--- a/paddle/fluid/distributed/service/communicator.cc
+++ b/paddle/fluid/distributed/service/communicator.cc
@@ -39,7 +39,7 @@ inline double GetCurrentUS() {
 Communicator::Communicator() {}

 void Communicator::init_gflag(const std::string &gflags) {
-  VLOG(0) << "Init With Gflags:" << gflags;
+  VLOG(3) << "Init With Gflags:" << gflags;
   std::vector<std::string> flags = paddle::string::split_string(gflags);
   if (flags.size() < 1) {
     flags.push_back("-max_body_size=314217728");
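Note: Paddle's VLOG comes from glog, where a VLOG(n) message is emitted only when the active verbosity (the GLOG_v environment variable) is at least n, so demoting a message from VLOG(0) to VLOG(3), as this patch does throughout, silences it by default without removing it. A minimal, self-contained sketch of that behavior, assuming only that glog is available (this file is illustrative, not part of the patch):

    #include <glog/logging.h>

    int main(int argc, char* argv[]) {
      FLAGS_logtostderr = true;  // print to stderr instead of log files
      google::InitGoogleLogging(argv[0]);
      // With the default verbosity of 0, only the first line appears;
      // run with GLOG_v=3 to also see the messages this patch demotes.
      VLOG(0) << "always visible at the default verbosity";
      VLOG(3) << "visible only when verbosity >= 3";
      return 0;
    }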
diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h
index fd53e0e4f4a..043fe9d83df 100644
--- a/paddle/fluid/distributed/service/communicator.h
+++ b/paddle/fluid/distributed/service/communicator.h
@@ -199,10 +199,10 @@ class Communicator {
   Communicator();

   explicit Communicator(const std::map<std::string, std::string> &envs_) {
-    VLOG(0) << "Communicator Init Envs";
+    VLOG(3) << "Communicator Init Envs";
     for (auto &iter : envs_) {
       envs[iter.first] = iter.second;
-      VLOG(0) << iter.first << ": " << iter.second;
+      VLOG(3) << iter.first << ": " << iter.second;
     }
     barrier_table_id_ = std::stoi(envs.at("barrier_table_id"));
     trainer_id_ = std::stoi(envs.at("trainer_id"));
@@ -436,7 +436,7 @@ class HalfAsyncCommunicator : public AsyncCommunicator {
     need_global_step_ =
         static_cast<bool>(std::stoi(envs.at("need_global_step")));

-    VLOG(0) << "HalfAsyncCommunicator Initialized";
+    VLOG(1) << "HalfAsyncCommunicator Initialized";
   }

   void MainThread() override;
@@ -481,7 +481,7 @@ class SyncCommunicator : public HalfAsyncCommunicator {
     need_global_step_ =
         static_cast<bool>(std::stoi(envs.at("need_global_step")));

-    VLOG(0) << "SyncCommunicator Initialized";
+    VLOG(1) << "SyncCommunicator Initialized";
   }

   void BarrierSend();
@@ -525,7 +525,7 @@ class GeoCommunicator : public AsyncCommunicator {
     // id_queue's size
     max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num"));
     send_queue_size_ = max_merge_var_num_;
-    VLOG(0) << "GeoCommunicator Initialized";
+    VLOG(1) << "GeoCommunicator Initialized";
   }

   void Send(const std::vector<std::string> &var_names,
diff --git a/paddle/fluid/distributed/service/heter_client.cc b/paddle/fluid/distributed/service/heter_client.cc
index b8354971495..10fc8368a26 100644
--- a/paddle/fluid/distributed/service/heter_client.cc
+++ b/paddle/fluid/distributed/service/heter_client.cc
@@ -34,7 +34,7 @@ void HeterClient::MainThread() {
 void HeterClient::Stop() {
   running_ = false;
   if (!is_initialized_) {
-    VLOG(0) << "HeterClient is not inited, do nothing";
+    VLOG(3) << "HeterClient is not inited, do nothing";
   } else {
     if (main_thread_) {
       auto status = StopHeterWorker();
@@ -42,20 +42,20 @@
       main_thread_->join();
       main_thread_.reset(nullptr);
     }
-    VLOG(1) << "HeterClient Stop Done";
+    VLOG(3) << "HeterClient Stop Done";
   }
 }

 void HeterClient::FinalizeWorker() {
   running_ = false;
   if (!is_initialized_) {
-    VLOG(0) << "HeterClient is not inited, do nothing";
+    VLOG(3) << "HeterClient is not inited, do nothing";
   } else {
     if (main_thread_) {
       main_thread_->join();
       main_thread_.reset(nullptr);
     }
-    VLOG(1) << "HeterClient Stop Done";
+    VLOG(3) << "HeterClient Stop Done";
   }
 }
diff --git a/paddle/fluid/distributed/service/heter_server.cc b/paddle/fluid/distributed/service/heter_server.cc
index 7e0ac8ecf35..57a1a16a723 100644
--- a/paddle/fluid/distributed/service/heter_server.cc
+++ b/paddle/fluid/distributed/service/heter_server.cc
@@ -89,7 +89,7 @@ int32_t HeterService::stop_heter_worker(const PsRequestMessage& request,
   stop_cpu_worker_set_.insert(client_id);
   if (stop_cpu_worker_set_.size() == fan_in_) {
     is_exit_ = true;
-    VLOG(0) << "Stop heter Service done.";
+    VLOG(3) << "Stop heter Service done.";
   }
   return 0;
 }
diff --git a/paddle/fluid/distributed/service/heter_server.h b/paddle/fluid/distributed/service/heter_server.h
index 5d967ae06d8..93fa37454a5 100644
--- a/paddle/fluid/distributed/service/heter_server.h
+++ b/paddle/fluid/distributed/service/heter_server.h
@@ -153,7 +153,7 @@ class HeterServer {
   virtual ~HeterServer() {}

   void Stop() {
-    VLOG(0) << "HeterServer Stop()";
+    VLOG(3) << "HeterServer Stop()";
     std::unique_lock<std::mutex> lock(mutex_);
     stoped_ = true;
     cv_.notify_all();
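The Communicator constructors above now dump every environment key/value pair at VLOG(3) instead of VLOG(0). When such a dump is expensive to build, glog's VLOG_IS_ON(n) macro lets the caller skip the formatting work entirely at lower verbosity; a hedged sketch of that pattern (BuildEnvDump and LogEnvs are hypothetical helpers, not from this patch):

    #include <glog/logging.h>
    #include <map>
    #include <sstream>
    #include <string>

    // Hypothetical helper that formats the env map the way the
    // constructor logs it, one "key: value" pair per line.
    std::string BuildEnvDump(const std::map<std::string, std::string>& envs) {
      std::ostringstream os;
      for (const auto& kv : envs) os << kv.first << ": " << kv.second << '\n';
      return os.str();
    }

    void LogEnvs(const std::map<std::string, std::string>& envs) {
      if (VLOG_IS_ON(3)) {  // build the dump only when verbosity >= 3
        VLOG(3) << "Communicator Init Envs\n" << BuildEnvDump(envs);
      }
    }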
diff --git a/paddle/fluid/distributed/table/common_dense_table.cc b/paddle/fluid/distributed/table/common_dense_table.cc
index 4063e4f501d..87a9f5fb242 100644
--- a/paddle/fluid/distributed/table/common_dense_table.cc
+++ b/paddle/fluid/distributed/table/common_dense_table.cc
@@ -94,7 +94,7 @@ int32_t CommonDenseTable::initialize_optimizer() {
   } else {
     VLOG(0) << "init optimizer failed";
   }
-  VLOG(0) << "init optimizer " << name << " done";
+  VLOG(3) << "init optimizer " << name << " done";
   return 0;
 }
diff --git a/paddle/fluid/distributed/table/common_dense_table.h b/paddle/fluid/distributed/table/common_dense_table.h
index e363afc45c5..74366f03588 100644
--- a/paddle/fluid/distributed/table/common_dense_table.h
+++ b/paddle/fluid/distributed/table/common_dense_table.h
@@ -47,15 +47,12 @@ class CommonDenseTable : public DenseTable {
   virtual int32_t set_global_lr(float* lr) override;

   int32_t load(const std::string& path, const std::string& param) override {
-    VLOG(0) << "Dense table may load by "
-               "paddle.distributed.fleet.init_server";
+    VLOG(0) << "WARNING: dense variables will load on No.0 trainer";
     return 0;
   }

   int32_t save(const std::string& path, const std::string& param) override {
-    VLOG(0)
-        << "Dense table may be saved by "
-           "paddle.distributed.fleet.save_persistables/save_inference_model";
+    VLOG(0) << "WARNING: dense variables will save on No.0 trainer";
     return 0;
   }
diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc
index 9155bb7c206..ffedbea14a0 100644
--- a/paddle/fluid/distributed/table/common_sparse_table.cc
+++ b/paddle/fluid/distributed/table/common_sparse_table.cc
@@ -170,7 +170,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
     auto id = std::stoull(values[0]);

     if (id % pserver_num != pserver_id) {
-      VLOG(0) << "will not load " << values[0] << " from " << valuepath
+      VLOG(3) << "will not load " << values[0] << " from " << valuepath
              << ", please check id distribution";
       continue;
     }
@@ -263,7 +263,7 @@ int32_t CommonSparseTable::initialize_value() {
     }
   }

-  VLOG(0) << "has " << feasigns.size() << " ids need to be pre inited";
+  VLOG(3) << "has " << feasigns.size() << " ids need to be pre inited";

   auto buckets = bucket(feasigns.size(), 10);
   for (int x = 0; x < 10; ++x) {
@@ -295,10 +295,10 @@ int32_t CommonSparseTable::initialize_optimizer() {
     optimizer_ = std::make_shared(value_names_, value_dims_,
                                   value_offsets_, value_idx_);
   } else {
-    VLOG(0) << "init optimizer failed";
+    VLOG(3) << "init optimizer failed";
   }

-  VLOG(0) << "init optimizer " << name << " done";
+  VLOG(3) << "init optimizer " << name << " done";
   return 0;
 }
@@ -311,7 +311,7 @@ int32_t CommonSparseTable::set_global_lr(float* lr) {

 int32_t CommonSparseTable::load(const std::string& path,
                                 const std::string& param) {
   rwlock_->WRLock();
-  VLOG(0) << "sparse table load with " << path << " with meta " << param;
+  VLOG(3) << "sparse table load with " << path << " with meta " << param;
   LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_,
                &shard_values_);
   rwlock_->UNLock();
@@ -322,7 +322,7 @@ int32_t CommonSparseTable::save(const std::string& dirname,
                                 const std::string& param) {
   rwlock_->WRLock();
   int mode = std::stoi(param);
-  VLOG(0) << "sparse table save: " << dirname << " mode: " << mode;
+  VLOG(3) << "sparse table save: " << dirname << " mode: " << mode;

   auto varname = _config.common().table_name();
   std::string var_store =
@@ -538,11 +538,11 @@ int32_t CommonSparseTable::flush() { return 0; }
 int32_t CommonSparseTable::shrink(const std::string& param) {
   rwlock_->WRLock();
   int threshold = std::stoi(param);
-  VLOG(0) << "sparse table shrink: " << threshold;
+  VLOG(3) << "sparse table shrink: " << threshold;

   for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) {
     // shrink
-    VLOG(0) << shard_id << " " << task_pool_size_ << " begin shrink";
+    VLOG(4) << shard_id << " " << task_pool_size_ << " begin shrink";
     shard_values_[shard_id]->Shrink(threshold);
   }
   rwlock_->UNLock();
--
GitLab
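Read together, the changes appear to settle on a rough verbosity ladder, though no such convention is stated in the commit itself: VLOG(0) for warnings every user should see (the dense load/save notices), VLOG(1) for one-line lifecycle events (the "... Initialized" messages), VLOG(3) for per-call diagnostics (table load/save/shrink, client/server start and stop), and VLOG(4) for per-shard detail inside loops. A sketch of how new code might follow that inferred ladder (ShrinkAllShards is a hypothetical function, not part of the patch):

    #include <glog/logging.h>

    void ShrinkAllShards(int task_pool_size, int threshold) {
      VLOG(3) << "sparse table shrink: " << threshold;  // per-call diagnostic
      for (int shard_id = 0; shard_id < task_pool_size; ++shard_id) {
        VLOG(4) << "shard " << shard_id << " begin shrink";  // per-shard detail
      }
    }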