From 79f92509a9f9d98f1dd9e68c5683728fb1c72107 Mon Sep 17 00:00:00 2001
From: Fan Zhang
Date: Tue, 17 Aug 2021 13:08:53 +0800
Subject: [PATCH] [CPU-PSLIB] Add config for scale_sparse_grad in
 config_fleet.py (#34933)

* [CPU-PSLIB] Fix bug for consistency inspection of op's embedding name and
  sparse table name in config_fleet.py (#34441)

* [CPU-PSLIB] Add config for scale_sparse_grad in config_fleet.py
---
 paddle/fluid/framework/device_worker.h                 | 1 +
 paddle/fluid/framework/downpour_worker.cc              | 8 ++++++--
 paddle/fluid/framework/downpour_worker_opt.cc          | 4 +++-
 paddle/fluid/framework/fleet/fleet_wrapper.cc          | 5 +++--
 paddle/fluid/framework/fleet/fleet_wrapper.h           | 3 ++-
 paddle/fluid/framework/trainer_desc.proto              | 1 +
 .../fleet/parameter_server/pslib/optimizer_factory.py  | 2 ++
 python/paddle/fluid/trainer_desc.py                    | 4 ++++
 python/paddle/fluid/trainer_factory.py                 | 4 ++++
 9 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 8c6c729a33a..4e640b09675 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -148,6 +148,7 @@ class DeviceWorker {
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
+  bool scale_sparse_gradient_with_batch_size_;
   std::vector<std::string> all_param_;
 };
 
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index c36f010a941..9e04cca0935 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -78,6 +78,8 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
   use_cvm_ = desc.use_cvm();
   // for sparse value accessor, embedding only
   no_cvm_ = desc.no_cvm();
+  scale_sparse_gradient_with_batch_size_ =
+      desc.scale_sparse_gradient_with_batch_size();
   scale_datanorm_ = desc.scale_datanorm();
   dump_slot_ = desc.dump_slot();
   dump_fields_.resize(desc.dump_fields_size());
@@ -614,7 +616,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
             *thread_scope_, tid, features_[tid], feature_labels_[tid],
             sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
             &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-            dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+            dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+            scale_sparse_gradient_with_batch_size_);
         timeline.Pause();
         push_sparse_time += timeline.ElapsedSec();
         total_time += timeline.ElapsedSec();
@@ -887,7 +890,8 @@ void DownpourWorker::TrainFiles() {
             *thread_scope_, tid, features_[tid], feature_labels_[tid],
             sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
             &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-            dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+            dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+            scale_sparse_gradient_with_batch_size_);
       }
     }
 
diff --git a/paddle/fluid/framework/downpour_worker_opt.cc b/paddle/fluid/framework/downpour_worker_opt.cc
index 79f80a373a2..8821edb1abe 100644
--- a/paddle/fluid/framework/downpour_worker_opt.cc
+++ b/paddle/fluid/framework/downpour_worker_opt.cc
@@ -464,11 +464,13 @@ void DownpourWorkerOpt::TrainFiles() {
             break;
           }
         }
+        bool scale_sparse_gradient_with_batch_size_ = true;
         fleet_ptr_->PushSparseVarsWithLabelAsync(
             *thread_scope_, tid, features_[tid], feature_labels_[tid],
             sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
             &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-            dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+            dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+            scale_sparse_gradient_with_batch_size_);
       }
     }
 
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index 6287c9d2acc..e20ffd06ef8 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -531,7 +531,8 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
     std::vector<std::vector<float>>* push_values,
     std::vector<::std::future<int32_t>>* push_sparse_status,
     const int batch_size, const bool use_cvm, const bool dump_slot,
-    std::vector<uint64_t>* sparse_push_keys, const bool no_cvm) {
+    std::vector<uint64_t>* sparse_push_keys, const bool no_cvm,
+    const bool scale_sparse_gradient_with_batch_size) {
 #ifdef PADDLE_WITH_PSLIB
   int offset = 2;
   int slot_offset = 0;
@@ -595,7 +596,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
     }
     float* g = g_tensor->data<float>();
 
-    if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
+    if (scale_sparse_gradient_with_batch_size && grad_dim > 0) {
       int dim = emb_dim;
       Eigen::Map<
           Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index 9d60beb7fd8..d3732c3029e 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -164,7 +164,8 @@ class FleetWrapper {
       std::vector<std::vector<float>>* push_values,
       std::vector<::std::future<int32_t>>* push_sparse_status,
       const int batch_size, const bool use_cvm, const bool dump_slot,
-      std::vector<uint64_t>* sparse_push_keys, const bool no_cvm);
+      std::vector<uint64_t>* sparse_push_keys, const bool no_cvm,
+      const bool scale_sparse_gradient_with_batch_size);
 
   // Push sparse variables to server in async mode
   void PushSparseFromTensorWithLabelAsync(
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index b38572681dc..f18edcc7fff 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -51,6 +51,7 @@ message TrainerDesc {
   repeated string loss_names = 23;
 
   optional string user_define_dump_filename = 24;
+  optional bool scale_sparse_gradient_with_batch_size = 25 [ default = true ];
 
   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 311c6271f2f..1e750031f33 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -593,6 +593,8 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["worker_skipped_ops"] = worker_skipped_ops
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
         opt_info["no_cvm"] = strategy.get("no_cvm", False)
+        opt_info["scale_sparse_gradient_with_batch_size"] = strategy.get(
+            "scale_sparse_gradient_with_batch_size", True)
         opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
         opt_info["local_tables"] = strategy.get("local_tables", [])
         opt_info["async_tables"] = strategy.get("async_tables", [])
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 2dda17072bf..6c4621ac39b 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -84,6 +84,10 @@ class TrainerDesc(object):
     def _set_no_cvm(self, no_cvm=False):
         self.proto_desc.no_cvm = no_cvm
 
+    def _set_scale_sparse_grad_with_batch_size(
+            self, scale_sparse_gradient_with_batch_size=True):
+        self.proto_desc.scale_sparse_gradient_with_batch_size = scale_sparse_gradient_with_batch_size
+
     def _set_scale_datanorm(self, scale_datanorm=-1):
         self.proto_desc.scale_datanorm = scale_datanorm
 
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index f0bc4a90292..ec6d10a37b8 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -80,6 +80,10 @@ class TrainerFactory(object):
                 trainer._set_use_cvm(opt_info["use_cvm"])
             if opt_info.get("no_cvm") is not None:
                 trainer._set_no_cvm(opt_info["no_cvm"])
+            if opt_info.get(
+                    "scale_sparse_gradient_with_batch_size") is not None:
+                trainer._set_scale_sparse_grad_with_batch_size(opt_info[
+                    "scale_sparse_gradient_with_batch_size"])
             if opt_info.get("scale_datanorm") is not None:
                 trainer._set_scale_datanorm(opt_info["scale_datanorm"])
             if opt_info.get("adjust_ins_weight") is not None:
-- 
GitLab
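Editor's usage note (not part of the patch): the new flag flows from the fleet
strategy dict (optimizer_factory.py) into TrainerDesc, then to DownpourWorker,
and finally to FleetWrapper::PushSparseVarsWithLabelAsync, where it gates the
multiplication of pushed sparse gradients by the batch size. A minimal sketch
of how a user would set it, assuming the PSLib fleet entry point shown in the
patch; the optimizer and loss are placeholders for a real training program:

    # Sketch: pass the new key through the strategy dict read by
    # DistributedAdam; everything except the new key is placeholder setup.
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

    optimizer = fluid.optimizer.Adam(learning_rate=0.01)
    strategy = {
        # New in this patch: when False, pushed sparse gradients are no
        # longer multiplied by the batch size (default remains True).
        "scale_sparse_gradient_with_batch_size": False,
    }
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    # optimizer.minimize(loss)  # 'loss' comes from a user-defined network

Note that DownpourWorkerOpt::TrainFiles hardcodes the flag to true, so only
the default DownpourWorker path is configurable through this strategy key.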