Unverified commit 79f92509, authored by Fan Zhang, committed by GitHub

[CPU-PSLIB] Add config for scale_sparse_grad in config_fleet.py (#34933)

* [CPU-PSLIB] Fix bug for consistency inspection of op's embedding name and sparse table name in config_fleet.py (#34441)

* [CPU-PSLIB] Add config for scale_sparse_grad in config_fleet.py
Parent 61c121cd
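The commit surfaces scale_sparse_gradient_with_batch_size as a strategy key that DistributedAdam reads (see the Python hunks below). A minimal, self-contained sketch of how the key and its default behave — plain dicts only, no PaddlePaddle imports, so the surrounding fleet wiring is assumed rather than shown:

# Sketch of the strategy lookup this commit adds to DistributedAdam (no Paddle
# imports; the dict stands in for the user strategy built in config_fleet.py).
strategy = {"use_cvm": True}  # user config; the new key is omitted on purpose

opt_info = {}
opt_info["use_cvm"] = strategy.get("use_cvm", False)
opt_info["no_cvm"] = strategy.get("no_cvm", False)
# New key: defaults to True, so existing configs keep scaling sparse
# gradients with the batch size unless they opt out explicitly.
opt_info["scale_sparse_gradient_with_batch_size"] = strategy.get(
    "scale_sparse_gradient_with_batch_size", True)

print(opt_info["scale_sparse_gradient_with_batch_size"])  # True

In the PSLib setup this dict is typically the strategy passed to the fleet distributed optimizer, which is what config_fleet.py assembles.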
@@ -148,6 +148,7 @@ class DeviceWorker {
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
+  bool scale_sparse_gradient_with_batch_size_;
   std::vector<std::string> all_param_;
 };
...
@@ -78,6 +78,8 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
   use_cvm_ = desc.use_cvm();
   // for sparse value accessor, embedding only
   no_cvm_ = desc.no_cvm();
+  scale_sparse_gradient_with_batch_size_ =
+      desc.scale_sparse_gradient_with_batch_size();
   scale_datanorm_ = desc.scale_datanorm();
   dump_slot_ = desc.dump_slot();
   dump_fields_.resize(desc.dump_fields_size());
@@ -614,7 +616,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
           *thread_scope_, tid, features_[tid], feature_labels_[tid],
           sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
           &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-          dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+          dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+          scale_sparse_gradient_with_batch_size_);
       timeline.Pause();
       push_sparse_time += timeline.ElapsedSec();
       total_time += timeline.ElapsedSec();
@@ -887,7 +890,8 @@ void DownpourWorker::TrainFiles() {
          *thread_scope_, tid, features_[tid], feature_labels_[tid],
          sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
          &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-          dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+          dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+          scale_sparse_gradient_with_batch_size_);
     }
   }
...
@@ -464,11 +464,13 @@ void DownpourWorkerOpt::TrainFiles() {
          break;
        }
      }
+      bool scale_sparse_gradient_with_batch_size_ = true;
      fleet_ptr_->PushSparseVarsWithLabelAsync(
          *thread_scope_, tid, features_[tid], feature_labels_[tid],
          sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
          &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-          dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+          dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+          scale_sparse_gradient_with_batch_size_);
     }
   }
...
@@ -531,7 +531,8 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
     std::vector<std::vector<float>>* push_values,
     std::vector<::std::future<int32_t>>* push_sparse_status,
     const int batch_size, const bool use_cvm, const bool dump_slot,
-    std::vector<uint64_t>* sparse_push_keys, const bool no_cvm) {
+    std::vector<uint64_t>* sparse_push_keys, const bool no_cvm,
+    const bool scale_sparse_gradient_with_batch_size) {
 #ifdef PADDLE_WITH_PSLIB
   int offset = 2;
   int slot_offset = 0;
@@ -595,7 +596,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
     }
     float* g = g_tensor->data<float>();
-    if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
+    if (scale_sparse_gradient_with_batch_size && grad_dim > 0) {
       int dim = emb_dim;
       Eigen::Map<
           Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
...
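For context, the branch guarded above rescales the embedding-gradient block by the batch size before the push; the Eigen::Map line that actually applies the factor falls outside this hunk, so the exact expression (and its direction) is treated as an assumption here. A rough NumPy sketch of the idea:

# Rough sketch (NumPy stand-in for the Eigen::Map over g_tensor's buffer).
# Assumption: when the switch is on, the gradient columns are rescaled by the
# batch size; the precise factor lives in code not shown in this hunk.
import numpy as np

batch_size, emb_dim, grad_dim = 32, 9, 8   # illustrative sizes only
g = np.random.rand(batch_size, emb_dim).astype(np.float32)

scale_sparse_gradient_with_batch_size = True
if scale_sparse_gradient_with_batch_size and grad_dim > 0:
    g[:, -grad_dim:] *= batch_size  # assumed direction of the rescale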
@@ -164,7 +164,8 @@ class FleetWrapper {
       std::vector<std::vector<float>>* push_values,
       std::vector<::std::future<int32_t>>* push_sparse_status,
       const int batch_size, const bool use_cvm, const bool dump_slot,
-      std::vector<uint64_t>* sparse_push_keys, const bool no_cvm);
+      std::vector<uint64_t>* sparse_push_keys, const bool no_cvm,
+      const bool scale_sparse_gradient_with_batch_size);
   // Push sparse variables to server in async mode
   void PushSparseFromTensorWithLabelAsync(
...
@@ -51,6 +51,7 @@ message TrainerDesc {
   repeated string loss_names = 23;
   optional string user_define_dump_filename = 24;
+  optional bool scale_sparse_gradient_with_batch_size = 25 [ default = true ];
   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
...
@@ -593,6 +593,8 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["worker_skipped_ops"] = worker_skipped_ops
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
         opt_info["no_cvm"] = strategy.get("no_cvm", False)
+        opt_info["scale_sparse_gradient_with_batch_size"] = strategy.get(
+            "scale_sparse_gradient_with_batch_size", True)
         opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
         opt_info["local_tables"] = strategy.get("local_tables", [])
         opt_info["async_tables"] = strategy.get("async_tables", [])
...
@@ -84,6 +84,10 @@ class TrainerDesc(object):
     def _set_no_cvm(self, no_cvm=False):
         self.proto_desc.no_cvm = no_cvm

+    def _set_scale_sparse_grad_with_batch_size(
+            self, scale_sparse_gradient_with_batch_size=True):
+        self.proto_desc.scale_sparse_gradient_with_batch_size = scale_sparse_gradient_with_batch_size
+
     def _set_scale_datanorm(self, scale_datanorm=-1):
         self.proto_desc.scale_datanorm = scale_datanorm
...
@@ -80,6 +80,10 @@ class TrainerFactory(object):
                 trainer._set_use_cvm(opt_info["use_cvm"])
             if opt_info.get("no_cvm") is not None:
                 trainer._set_no_cvm(opt_info["no_cvm"])
+            if opt_info.get(
+                    "scale_sparse_gradient_with_batch_size") is not None:
+                trainer._set_scale_sparse_grad_with_batch_size(opt_info[
+                    "scale_sparse_gradient_with_batch_size"])
             if opt_info.get("scale_datanorm") is not None:
                 trainer._set_scale_datanorm(opt_info["scale_datanorm"])
             if opt_info.get("adjust_ins_weight") is not None:
...
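Putting the Python side together, the key travels from opt_info through the TrainerFactory guard into the TrainerDesc proto field. A self-contained mock of that plumbing — the proto is replaced by a plain namespace, so nothing below is the real PaddlePaddle API:

# Mock of the opt_info -> TrainerDesc flow shown in the two hunks above.
from types import SimpleNamespace

class MockTrainerDesc:
    def __init__(self):
        # Stand-in for proto_desc; the real field defaults to True.
        self.proto_desc = SimpleNamespace(
            scale_sparse_gradient_with_batch_size=True)

    def _set_scale_sparse_grad_with_batch_size(
            self, scale_sparse_gradient_with_batch_size=True):
        self.proto_desc.scale_sparse_gradient_with_batch_size = (
            scale_sparse_gradient_with_batch_size)

opt_info = {"scale_sparse_gradient_with_batch_size": False}
trainer = MockTrainerDesc()
if opt_info.get("scale_sparse_gradient_with_batch_size") is not None:
    trainer._set_scale_sparse_grad_with_batch_size(
        opt_info["scale_sparse_gradient_with_batch_size"])

print(trainer.proto_desc.scale_sparse_gradient_with_batch_size)  # False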