From dc0702fe12e82fe37f8229fb1aec039e53bb68fd Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Thu, 31 Mar 2022 15:22:12 +0800 Subject: [PATCH] Using DistConfig in Paddle Inference (#41128) * Pass compat of conv_transpose_bias_mkldnn_fuse_pass * Fix a bug of strided_slice op, about the axes parameter access memory out of bounds * Fix a bug of strided_slice op, about the axes parameter access memory out of bounds * Fix a bug of transpose op, about accessing memory out of bounds of the perm param * op:transpose_op supports bool type * op:transpose_op supports bool type * Keep strided_slice op behavior consistent with slice op when starts input is less than -rank * Using DistConfig in inference --- .../fluid/distributed/ps/table/CMakeLists.txt | 4 +- .../distributed/ps/table/tensor_table.cc | 107 +--------------- .../fluid/distributed/ps/table/tensor_table.h | 117 +++++++++++++++--- paddle/fluid/inference/CMakeLists.txt | 2 +- .../inference/api/paddle_inference_api.h | 1 + 5 files changed, 107 insertions(+), 124 deletions(-) diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index af4cad035e2..227d0a9f1cd 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -34,10 +34,9 @@ ${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator ${EXTERN_DEP}) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_framework_proto device_context) -cc_library(tensor_table SRCS tensor_table.cc DEPS eigen3 ps_framework_proto executor scope device_context tensor ${TABLE_DEPS}) +cc_library(tensor_table SRCS DEPS eigen3 ps_framework_proto executor scope device_context tensor ${TABLE_DEPS}) 
set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -54,6 +53,7 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table) + cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/tensor_table.cc b/paddle/fluid/distributed/ps/table/tensor_table.cc index dfe778fa61e..187c7021d02 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.cc +++ b/paddle/fluid/distributed/ps/table/tensor_table.cc @@ -16,110 +16,5 @@ DECLARE_double(eager_delete_tensor_gb); namespace paddle { -namespace distributed { - -int32_t TensorTable::set_program_env( - framework::Scope *scope, platform::Place place, - const std::vector *sub_program) { - scope_ = scope; - place_ = place; - executor_ = new framework::Executor(place_); - sub_program_ = sub_program; - return 0; -} - -int32_t GlobalStepTable::initialize() { - auto _program_config = _config.tensor(); - auto trainers_ = _config.common().trainer_num(); - FLAGS_eager_delete_tensor_gb = -1; - // Get Config - if (_program_config.has_startup_program_id()) { - startup_program_id_ = _program_config.startup_program_id(); - } - if (_program_config.has_main_program_id()) { - main_program_id_ = _program_config.main_program_id(); - } - if (_program_config.has_feed_var_name()) { - feed_var_name_ = _program_config.feed_var_name(); - } - if (_program_config.has_fetch_var_name()) { - fetch_var_name_ = _program_config.fetch_var_name(); - } - - // 
Run startup program - if (startup_program_id_ != -1) { - std::map fake_feed; - std::map fake_fetch; - auto startup_program_desc = sub_program_->at(startup_program_id_); - auto ctx = executor_->Prepare(startup_program_desc, 0); - executor_->RunPreparedContext(ctx.get(), scope_, false); - } - - if (main_program_id_ != -1) { - // Run main porgram, if program is used for learning decay - auto main_program_desc = sub_program_->at(main_program_id_); - auto main_ctx = executor_->Prepare(main_program_desc, 0); - exec_context_ = std::move(main_ctx); - executor_->RunPreparedContext(exec_context_.get(), scope_, false); - // init decay_counters - decay_counters_.reserve(trainers_); - for (int32_t i = 0; i < trainers_; ++i) { - decay_counters_[i] = 0; - } - } - - return 0; -} - -int32_t GlobalStepTable::set_table_map( - std::unordered_map> *table_map) { - auto *lr_var = scope_->FindVar(fetch_var_name_); - auto *lr_tensor = lr_var->GetMutable(); - auto *lr_value = lr_tensor->mutable_data(platform::CPUPlace()); - VLOG(3) << "GlobalStepTable::set_table_map set global lr: " << *lr_value; - - for (auto iter = table_map->begin(); iter != table_map->end(); iter++) { - auto table_id = iter->first; - if (table_id == _config.table_id()) { - continue; - } - iter->second->set_global_lr(lr_value); - } - return 0; -} - -int32_t GlobalStepTable::push_dense(const int64_t *values, - const int32_t trainer_id) { - return _run_program(values, trainer_id); -} - -int32_t GlobalStepTable::_run_program(const int64_t *values, - const uint32_t trainer_id) { - FLAGS_eager_delete_tensor_gb = -1; - auto counter = decay_counters_.at(trainer_id); - counter += int(values[0]); - decay_counters_.at(trainer_id) = counter; - - auto *global_step_var = scope_->FindVar(feed_var_name_); - auto *tensor = global_step_var->GetMutable(); - auto *value = tensor->mutable_data(platform::CPUPlace()); - - auto global_counter = 0; - for (auto &trainer_counter : decay_counters_) { - global_counter += trainer_counter.second; - } 
- - // Todo: hard code for increment op - value[0] = global_counter - 1; - VLOG(3) << "GlobalStepTable::_run_program global_counter " << value[0]; - - executor_->RunPreparedContext(exec_context_.get(), scope_, false, false); - auto *lr_var = scope_->FindVar(fetch_var_name_); - auto *lr_tensor = lr_var->GetMutable(); - auto *lr_value = lr_tensor->mutable_data(platform::CPUPlace()); - VLOG(3) << "GlobalStepTable::LR value: " << lr_value[0]; - return 0; -} - -} // namespace distributed +namespace distributed {} // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h index 23a62365c0f..e59314923cd 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.h +++ b/paddle/fluid/distributed/ps/table/tensor_table.h @@ -37,6 +37,8 @@ struct ExecutorPrepareContext; } // namespace framework } // namespace paddle +DECLARE_double(eager_delete_tensor_gb); + namespace paddle { namespace distributed { @@ -66,9 +68,9 @@ class TensorTable : public Table { virtual void *get_shard(size_t shard_idx) { return 0; } - virtual int32_t initialize_shard() { return 0; }; + virtual int32_t initialize_shard() { return 0; } - virtual int32_t flush() { return 0; }; + virtual int32_t flush() { return 0; } virtual int32_t load(const std::string &path, const std::string ¶m) { return 0; @@ -77,18 +79,23 @@ class TensorTable : public Table { return 0; } - virtual void clear(){}; + virtual void clear() {} - virtual int32_t initialize() override { return 0; }; + int32_t initialize() override { return 0; } - virtual int32_t push_dense(const int64_t *values, - const int32_t trainer_id) override { + int32_t push_dense(const int64_t *values, const int32_t trainer_id) override { return 0; - }; + } - virtual int32_t set_program_env( + int32_t set_program_env( framework::Scope *scope, platform::Place place, - const std::vector *sub_program) override; + const std::vector *sub_program) override { + scope_ = 
scope; + place_ = place; + executor_ = new framework::Executor(place_); + sub_program_ = sub_program; + return 0; + } protected: framework::Executor *executor_; @@ -135,7 +142,7 @@ class DenseTensorTable : public TensorTable { /*----------------------------------------------------------------------*/ - virtual int32_t initialize() override { return 0; } + int32_t initialize() override { return 0; } int32_t push_dense(const float *values, size_t num) override { return 0; } @@ -189,18 +196,98 @@ class GlobalStepTable : public DenseTensorTable { /*----------------------------------------------------------------------*/ - int32_t initialize() override; + int32_t initialize() override { + auto _program_config = _config.tensor(); + auto trainers_ = _config.common().trainer_num(); + FLAGS_eager_delete_tensor_gb = -1; + // Get Config + if (_program_config.has_startup_program_id()) { + startup_program_id_ = _program_config.startup_program_id(); + } + if (_program_config.has_main_program_id()) { + main_program_id_ = _program_config.main_program_id(); + } + if (_program_config.has_feed_var_name()) { + feed_var_name_ = _program_config.feed_var_name(); + } + if (_program_config.has_fetch_var_name()) { + fetch_var_name_ = _program_config.fetch_var_name(); + } + + // Run startup program + if (startup_program_id_ != -1) { + std::map fake_feed; + std::map fake_fetch; + auto startup_program_desc = sub_program_->at(startup_program_id_); + auto ctx = executor_->Prepare(startup_program_desc, 0); + executor_->RunPreparedContext(ctx.get(), scope_, false); + } + + if (main_program_id_ != -1) { + // Run main program, if program is used for learning decay + auto main_program_desc = sub_program_->at(main_program_id_); + auto main_ctx = executor_->Prepare(main_program_desc, 0); + exec_context_ = std::move(main_ctx); + executor_->RunPreparedContext(exec_context_.get(), scope_, false); + // init decay_counters + decay_counters_.reserve(trainers_); + for (int32_t i = 0; i < trainers_; ++i) { + 
decay_counters_[i] = 0; + } + } + } int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t push_dense(const int64_t *values, const int32_t trainer_id); + int32_t push_dense(const int64_t *values, const int32_t trainer_id) { + return _run_program(values, trainer_id); + } - int32_t set_table_map( - std::unordered_map> *table_map) override; + int32_t set_table_map(std::unordered_map> + *table_map) override { + auto *lr_var = scope_->FindVar(fetch_var_name_); + auto *lr_tensor = lr_var->GetMutable(); + auto *lr_value = lr_tensor->mutable_data(platform::CPUPlace()); + VLOG(3) << "GlobalStepTable::set_table_map set global lr: " << *lr_value; + + for (auto iter = table_map->begin(); iter != table_map->end(); iter++) { + auto table_id = iter->first; + if (table_id == _config.table_id()) { + continue; + } + iter->second->set_global_lr(lr_value); + } + return 0; + } private: virtual int32_t _run_program(const int64_t *values, - const uint32_t trainer_id); + const uint32_t trainer_id) { + FLAGS_eager_delete_tensor_gb = -1; + auto counter = decay_counters_.at(trainer_id); + counter += int(values[0]); + decay_counters_.at(trainer_id) = counter; + + auto *global_step_var = scope_->FindVar(feed_var_name_); + auto *tensor = global_step_var->GetMutable(); + auto *value = tensor->mutable_data(platform::CPUPlace()); + + auto global_counter = 0; + for (auto &trainer_counter : decay_counters_) { + global_counter += trainer_counter.second; + } + + // Todo: hard code for increment op + value[0] = global_counter - 1; + VLOG(3) << "GlobalStepTable::_run_program global_counter " << value[0]; + + executor_->RunPreparedContext(exec_context_.get(), scope_, false, false); + auto *lr_var = scope_->FindVar(fetch_var_name_); + auto *lr_tensor = lr_var->GetMutable(); + auto *lr_value = lr_tensor->mutable_data(platform::CPUPlace()); + VLOG(3) << "GlobalStepTable::LR value: " << lr_value[0]; + return 0; + } private: std::unordered_map decay_counters_; diff --git 
a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 5d0c3c98d2f..8cc4260289a 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -93,7 +93,7 @@ if (WITH_CRYPTO) endif (WITH_CRYPTO) if (WITH_PSCORE) - set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service tensor_table) endif () if (WITH_ONNXRUNTIME) diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 65906a57f46..58ccd79d84d 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -45,6 +45,7 @@ namespace paddle_infer { using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; +using DistConfig = paddle::DistConfig; /// /// \class Predictor -- GitLab