add ps_instance doc

bd1c1724 · heqiaozhi · dongdaxiang · 35ce6ac2 · bd1c1724 · bd1c1724
4 changed file
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
-# windows treat symbolic file as a real file, which is different with unix
+# Windows treats a symbolic file as a real file, which is different from Unix.
-# We create a hidden file and compile it instead of origin source file.
+# We create a hidden file and compile it instead of the original source file.
 function(windows_symbolic TARGET)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
@@ -11,7 +11,7 @@ function(windows_symbolic TARGET)
      message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
  endif()
-  # only copy the xx.cu to .xx.cu when the content are modified
+  # Only copy xx.cu to .xx.cu when the content is modified.
  set(copy_flag 1)
  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
  file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
@@ -32,7 +32,7 @@ endfunction()
 add_subdirectory(ir)
 add_subdirectory(details)
-# ddim lib
+#ddim lib
 proto_library(framework_proto SRCS framework.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)
@@ -89,8 +89,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
 if(WITH_GPU)
  if (WIN32)
-    # windows treat symbolic file as a real file, which is different with unix
+    # Windows treats a symbolic file as a real file, which is different from Unix.
-    # We create a hidden file and compile it instead of origin source file.
+    # We create a hidden file and compile it instead of the original source file.
      windows_symbolic(hidden_file SRCS data_type_transform.cu)
      nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
      add_dependencies(data_type_transform hidden_file)
@@ -137,7 +137,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
-# Generate an empty __init__.py to make framework_py_proto as a valid python module.
+# Generate an empty __init__.py to make framework_py_proto a valid
+# python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 if (NOT WIN32)

--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -30,7 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/pybind/pybind.h"
 #ifdef PADDLE_WITH_PSLIB
-#include "pslib.h"
+#include <pslib.h>
 #endif
 namespace paddle {
@@ -70,8 +70,7 @@ void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
 #ifdef PADDLE_WITH_PSLIB
 void AsyncExecutor::InitServer(const std::string& dist_desc, int index) {
-    _pslib_ptr =
+  _pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
-        std::shared_ptr<paddle::distributed::PSlib>(
      new paddle::distributed::PSlib());
  _pslib_ptr->init_server(dist_desc, index);
  InitParamConfig();
@@ -82,38 +81,41 @@ void AsyncExecutor::InitWorker(const std::string& dist_desc,
                               int node_num, int index) {
  _pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
      new paddle::distributed::PSlib());
-    _pslib_ptr->init_worker(
+  // const_cast stays valid whether or not data() yields a pointer-to-const
+  // (host_sign_list is declared outside this hunk); static_cast cannot
+  // remove const.
+  _pslib_ptr->init_worker(dist_desc,
-        dist_desc, (uint64_t*)(host_sign_list.data()), node_num, index);
+                          const_cast<uint64_t*>(host_sign_list.data()),
+                          node_num, index);
  InitParamConfig();
 }
-uint64_t AsyncExecutor::StartServer() {
+uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); }
-    return _pslib_ptr->run_server();
-}
-void AsyncExecutor::StopServer() {
+void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); }
-    _pslib_ptr->stop_server();
-}
-void AsyncExecutor::GatherServers(
+// Passes the host sign list and the node count to pslib's gather_servers.
+void AsyncExecutor::GatherServers(const std::vector<uint64_t>& host_sign_list,
-    const std::vector<uint64_t>& host_sign_list, int node_num) {
+                                  int node_num) {
-    _pslib_ptr->gather_servers((uint64_t*)(host_sign_list.data()), node_num);
+  // host_sign_list is const, so data() returns const uint64_t*; only
+  // const_cast (not static_cast) may drop that qualifier.
+  _pslib_ptr->gather_servers(const_cast<uint64_t*>(host_sign_list.data()),
+                             node_num);
 }
 void AsyncExecutor::InitParamConfig() {
-  for (int i = 0; i <
+  // Take fea_dim from the accessor of the first table whose table_class
+  // contains "SparseTable", then stop scanning.
+  for (int i = 0; i < _pslib_ptr->get_param()
-               _pslib_ptr->get_param()->server_param(). \
+                          ->server_param()
-               downpour_server_param().                 \
+                          .downpour_server_param()
-               downpour_table_param_size();
+                          .downpour_table_param_size();
        ++i) {
-    if (_pslib_ptr->get_param()->server_param().                \
+    if (_pslib_ptr->get_param()
-        downpour_server_param().downpour_table_param(i).        \
+            ->server_param()
-        table_class().find("SparseTable") != -1) {
+            .downpour_server_param()
-      _param_config.fea_dim = _pslib_ptr->get_param()->server_param().  \
+            .downpour_table_param(i)
-                              downpour_server_param().                  \
+            .table_class()
-                              downpour_table_param(i).                  \
+            .find("SparseTable") != std::string::npos) {
-                              accessor().fea_dim();
+      _param_config.fea_dim = _pslib_ptr->get_param()
+                                  ->server_param()
+                                  .downpour_server_param()
+                                  .downpour_table_param(i)
+                                  .accessor()
+                                  .fea_dim();
       break;
     }
   }
@@ -123,27 +125,23 @@ void AsyncExecutor::InitParamConfig() {
  _param_config.tmp_push_sparse_wait_times = static_cast<int32_t>(
      _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch());
-  for (auto t = 0u;
+  for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size();
-       t < _pslib_ptr->get_param()->trainer_param().skip_op_size();
       ++t) {
    _param_config.skip_op.push_back(
        _pslib_ptr->get_param()->trainer_param().skip_op(t));
  }
  for (auto t = 0u;
-       t < _pslib_ptr->get_param()->trainer_param().sparse_table_size();
+       t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) {
-       ++t) {
    auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t);
    std::vector<std::string> tmp_sparse_variable_name;
    for (int i = 0u; i < table.slot_value_size(); ++i) {
      tmp_sparse_variable_name.push_back(table.slot_value(i));
-      _param_config.slot_alias_to_table[table.slot_key(i)] =
+      _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id();
-          table.table_id();
    }
    std::vector<std::string> tmp_sparse_gradient_variable_name;
    for (auto i = 0u; i < table.slot_gradient_size(); ++i) {
-      tmp_sparse_gradient_variable_name.push_back(
+      tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i));
-          table.slot_gradient(i));
    }
    _param_config.slot_input_vec[table.table_id()] =
        std::move(tmp_sparse_variable_name);
@@ -153,8 +151,7 @@ void AsyncExecutor::InitParamConfig() {
  }
  for (auto t = 0u;
-       t < _pslib_ptr->get_param()->trainer_param().dense_table_size();
+       t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) {
-       ++t) {
    auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t);
    std::vector<std::string> tmp_dense_variable_name;
    for (int i = 0u; i < table.dense_variable_name_size(); ++i) {
@@ -198,8 +195,7 @@ void AsyncExecutor::InitModel() {
      regions.emplace_back(std::move(reg));
    }
-    auto push_status =
+    auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(
-        _pslib_ptr->_worker_ptr->push_dense_param(
        regions.data(), regions.size(), table_id);
    push_status.wait();
    auto status = push_status.get();
@@ -225,14 +221,14 @@ void AsyncExecutor::SaveModel(const std::string& path) {
 void AsyncExecutor::PrepareDenseThread(const std::string& mode) {
  if (mode == "mpi") {
    DensePullThreadParam param;
-    param.ps_client = _pslib_ptr->_worker_ptr;;
+    param.ps_client = _pslib_ptr->_worker_ptr;
    param.threshold = 1;
    param.training_thread_num = actual_thread_num;
    param.root_scope = root_scope_;
    param.dense_params = &_param_config.dense_variable_name;
-    _pull_dense_thread = std::shared_ptr<DensePullThread>(
+    _pull_dense_thread =
-        new DensePullThread(param));
+        std::shared_ptr<DensePullThread>(new DensePullThread(param));
    _pull_dense_thread->start();
  }
 }
@@ -243,8 +239,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
                                const std::vector<std::string>& filelist,
                                const int thread_num,
                                const std::vector<std::string>& fetch_var_names,
-                                const std::string& mode,
+                                const std::string& mode, const bool debug) {
-                                const bool debug) {
  std::vector<std::thread> threads;
  auto& block = main_program.Block(0);
@@ -308,7 +303,6 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
                  fetch_var_names, root_scope_, thidx, debug);
  }
  // start executing ops in multiple threads
  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
    threads.push_back(

--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/executor_thread_worker.h"
+#include <algorithm>
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
@@ -95,8 +96,7 @@ void DensePullThread::wait_all() {
    t.wait();
    auto status = t.get();
    if (status != 0) {
-      LOG(WARNING) << "pull dense failed times:" <<
+      LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times;
-          ++_pull_dense_fail_times;
    }
  }
@@ -108,8 +108,8 @@ void DensePullThread::wait_all() {
  _pull_dense_status.resize(0);
 }
-void DensePullThread::increase_thread_version(
+void DensePullThread::increase_thread_version(int thread_id,
-    int thread_id, uint64_t table_id) {
+                                              uint64_t table_id) {
  std::lock_guard<std::mutex> lock(_mutex_for_version);
  _training_versions[table_id][thread_id]++;
 }
@@ -174,7 +174,6 @@ void ExecutorThreadWorker::SetFetchVarNames(
                          fetch_var_names.end());
 }
 void ExecutorThreadWorker::SetDevice() {
 #if defined _WIN32 || defined __APPLE__
  return;
@@ -351,8 +350,7 @@ void AsyncExecutorThreadWorker::TrainOneNetwork() {
    }
    bool need_skip = false;
    for (auto t = 0u; t < _param_config->skip_op.size(); ++t) {
-      if (op->Type().find(_param_config->skip_op[t]) !=
+      if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) {
-          std::string::npos) {
        need_skip = true;
        break;
      }
@@ -437,13 +435,12 @@ void AsyncExecutorThreadWorker::PushDense(int table_id) {
    regions.emplace_back(std::move(reg));
  }
-  auto status = _pslib_ptr->_worker_ptr->push_dense(
+  auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(),
-      regions.data(), regions.size(), table_id);
+                                                    regions.size(), table_id);
  _push_dense_status.push_back(std::move(status));
 }
 void AsyncExecutorThreadWorker::PullSparse(int table_id) {
  auto& features = _features[table_id];
  auto& feature_value = _feature_value[table_id];
  auto fea_dim = _param_config->fea_dim;
@@ -451,8 +448,7 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) {
  features.clear();
  features.resize(0);
  features.reserve(MAX_FEASIGN_NUM);
-  const std::vector<std::string>& feed_vec =
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
-      thread_reader_->GetUseSlotAlias();
  // slot_idx = 0 is label TODO
  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
@@ -468,7 +464,7 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) {
      features.push_back(static_cast<uint64_t>(ids[i]));
    }
  }
-  check_pull_push_memory(features, feature_value, fea_dim);
+  check_pull_push_memory(features, &feature_value, fea_dim);
  std::vector<float*> pull_feature_value;
  for (auto i = 0u; i < features.size(); ++i) {
@@ -480,7 +476,7 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) {
  _pull_sparse_status.push_back(std::move(status));
  auto& push_g = _feature_push_value[table_id];
-  check_pull_push_memory(features, push_g, fea_dim);
+  check_pull_push_memory(features, &push_g, fea_dim);
  collect_feasign_info(table_id);
 }
@@ -497,8 +493,7 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) {
  std::vector<float> init_value(fea_dim);
-  const std::vector<std::string>& feed_vec =
+  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
-      thread_reader_->GetUseSlotAlias();
  // slot_idx = 0 is label TODO
  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
    Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]);
@@ -508,8 +503,8 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) {
    Variable* var_emb = thread_scope_->FindVar(
        _param_config->slot_input_vec[table_id][slot_idx - 1]);
    LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
-    float* ptr = tensor_emb->mutable_data<float>(
+    float* ptr =
-      {len, slot_dim}, platform::CPUPlace());
+        tensor_emb->mutable_data<float>({len, slot_dim}, platform::CPUPlace());
    memset(ptr, 0, sizeof(float) * len * slot_dim);
    auto& tensor_lod = tensor->lod()[0];
@@ -518,12 +513,12 @@ void AsyncExecutorThreadWorker::FillSparse(int table_id) {
    for (auto index = 0u; index < len; ++index) {
      if (ids[index] == 0u) {
-        memcpy(ptr + slot_dim * index,
+        memcpy(ptr + slot_dim * index, init_value.data() + 2,
-               init_value.data() + 2, sizeof(float) * slot_dim);
+               sizeof(float) * slot_dim);
        continue;
      }
-      memcpy(ptr + slot_dim * index,
+      memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2,
-             fea_value[fea_idx].data() + 2, sizeof(float) * slot_dim);
+             sizeof(float) * slot_dim);
      fea_idx++;
    }
  }
@@ -534,31 +529,34 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) {
  auto fea_dim = _param_config->fea_dim;
  auto& features = _features[table_id];
  auto& push_g = _feature_push_value[table_id];
-  check_pull_push_memory(features, push_g, fea_dim);
+  check_pull_push_memory(features, &push_g, fea_dim);
-  CHECK(push_g.size() == features.size() + 1) <<
+  CHECK(push_g.size() == features.size() + 1)
-      "push_g size:" << push_g.size() << " features size:" << features.size();
+      << "push_g size:" << push_g.size()
+      << " features size:" << features.size();
  uint64_t fea_idx = 0u;
  auto& fea_info = _fea_info[table_id];
  int offset = 2;
  const std::vector<std::string>& feed_vec = thread_reader_->GetUseSlotAlias();
  // slot_idx = 0 is label
  for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) {
-    if (_param_config->slot_alias_to_table.find(
+    if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) ==
-            feed_vec[slot_idx]) == _param_config->slot_alias_to_table.end()) {
+        _param_config->slot_alias_to_table.end()) {
-      LOG(ERROR) << "ERROR slot_idx:" << slot_idx <<
+      LOG(ERROR) << "ERROR slot_idx:" << slot_idx
-          " name:" << feed_vec[slot_idx];
+                 << " name:" << feed_vec[slot_idx];
-    } else if (
+    } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] !=
-        _param_config->slot_alias_to_table[feed_vec[slot_idx]] != table_id) {
+               table_id) {
      continue;
    }
    Variable* g_var = thread_scope_->FindVar(
        _param_config->gradient_var[table_id][slot_idx - 1]);
-    CHECK(g_var != nullptr) << "var[" <<
+    CHECK(g_var != nullptr)
-        _param_config->gradient_var[table_id][slot_idx - 1] << "] not found";
+        << "var[" << _param_config->gradient_var[table_id][slot_idx - 1]
+        << "] not found";
    LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
    if (g_tensor == NULL) {
-      LOG(ERROR) << "var[" <<
+      LOG(ERROR) << "var["
-          _param_config->gradient_var[table_id][slot_idx - 1] << "] not found";
+                 << _param_config->gradient_var[table_id][slot_idx - 1]
+                 << "] not found";
      exit(-1);
    }
    float* g = g_tensor->data<float>();
@@ -571,28 +569,27 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) {
      exit(-1);
    }
    int len = tensor->numel();
-    CHECK(slot_dim * len == g_tensor->numel()) <<
+    CHECK(slot_dim * len == g_tensor->numel())
-        "len:" << len << " g_numel:" << g_tensor->numel();
+        << "len:" << len << " g_numel:" << g_tensor->numel();
-    CHECK(len == tensor->numel()) << "len:" <<
+    CHECK(len == tensor->numel()) << "len:" << len
-        len << "t_numel:" << tensor->numel();
+                                  << "t_numel:" << tensor->numel();
    int64_t* ids = tensor->data<int64_t>();
    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
      if (ids[id_idx] == 0) {
        g += slot_dim;
        continue;
      }
-      memcpy(push_g[fea_idx].data() + offset,
+      memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim);
-             g, sizeof(float) * slot_dim);
      push_g[fea_idx][0] = 1.0f;
-      CHECK(fea_idx < fea_info.size()) << "fea_idx:" <<
+      CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx
-          fea_idx << " size:" << fea_info.size();
+                                       << " size:" << fea_info.size();
      push_g[fea_idx][1] = static_cast<float>(fea_info[fea_idx].label);
      g += slot_dim;
      fea_idx++;
    }
  }
-  CHECK(fea_idx == features.size()) << "fea_idx:" <<
+  CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx
-      fea_idx << " features size:" << features.size();
+                                    << " features size:" << features.size();
  CHECK_GT(features.size(), 0);
  std::vector<float*> push_g_vec;
@@ -600,13 +597,12 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) {
    push_g_vec.push_back(push_g[i].data());
  }
  auto status = _pslib_ptr->_worker_ptr->push_sparse(
-      table_id, features.data(),
+      table_id, features.data(), (const float**)push_g_vec.data(),
-      (const float**)push_g_vec.data(), features.size());
+      features.size());
  _push_sparse_status.push_back(std::move(status));
 }
-void AsyncExecutorThreadWorker::collect_feasign_info(
+void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) {
-    int table_id) {
  auto& fea_info = _fea_info[table_id];
  auto& feature = _features[table_id];
  fea_info.resize(feature.size());
@@ -633,31 +629,28 @@ void AsyncExecutorThreadWorker::collect_feasign_info(
      }
    }
  }
-  CHECK(global_index == feature.size()) <<
+  CHECK(global_index == feature.size())
-      "expect fea info size:" << feature.size()
+      << "expect fea info size:" << feature.size() << " real:" << global_index;
-                              << " real:" << global_index;
 }
 void AsyncExecutorThreadWorker::check_pull_push_memory(
    const std::vector<uint64_t>& features,
-        std::vector<std::vector<float>>& push_g,
+    std::vector<std::vector<float>>* push_g, int dim) {
-        int dim) {
+  push_g->resize(features.size() + 1);
-  push_g.resize(features.size() + 1);
+  for (auto& t : *push_g) {
-  for (auto& t : push_g) {
    t.resize(dim);
  }
 }
 void AsyncExecutorThreadWorker::check_pull_push_memory(
-    const std::vector<uint64_t>& features,
+    const std::vector<uint64_t>& features, std::vector<float*>* push_g,
-    std::vector<float*>& push_g,
    int dim) {
-  if (features.size() > push_g.size()) {
+  if (features.size() > push_g->size()) {
-    push_g.reserve(features.size() + 1);
+    push_g->reserve(features.size() + 1);
-    auto size = features.size() - push_g.size() + 1;
+    auto size = features.size() - push_g->size() + 1;
    for (auto i = 0u; i < size; ++i) {
      float* ptr = new float[dim];
-      push_g.push_back(ptr);
+      push_g->push_back(ptr);
    }
  }
 }

--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #ifdef PADDLE_WITH_PSLIB
-#include "pslib.h"
+#include <pslib.h>
 #endif
 namespace paddle {
@@ -34,7 +34,7 @@ namespace framework {
 void CreateTensor(Variable* var, proto::VarType::Type var_type);
 #ifdef PADDLE_WITH_PSLIB
-const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
+static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
 struct AsyncWorkerParamConfig {
  int slot_dim;
@@ -66,8 +66,8 @@ struct DensePullThreadParam {
 class DensePullThread {
 public:
-  explicit DensePullThread(const DensePullThreadParam& param) :
+  explicit DensePullThread(const DensePullThreadParam& param)
-  _running(false) {
+      : _running(false) {
    _ps_client = param.ps_client;
    _threshold = param.threshold;
    _thread_num = param.training_thread_num;
@@ -75,8 +75,7 @@ class DensePullThread {
    _sleep_time_ms = param.sleep_time_ms;
    for (auto& t : *param.dense_params) {
-      _dense_variable_name[t.first].insert(
+      _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(),
-          _dense_variable_name[t.first].end(),
                                           t.second.begin(), t.second.end());
      _training_versions[t.first].resize(_thread_num, 0);
      _last_versions[t.first] = 0;
@@ -136,7 +135,7 @@ class DensePullThread {
 class ExecutorThreadWorker {
 public:
-ExecutorThreadWorker()
+  ExecutorThreadWorker()
      : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
  virtual ~ExecutorThreadWorker() {}
@@ -161,10 +160,8 @@ ExecutorThreadWorker()
 #ifdef PADDLE_WITH_PSLIB
  virtual void SetPSlibPtr(
      std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {}
-  virtual void SetPullDenseThread(
+  virtual void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) {}
-      std::shared_ptr<DensePullThread> dpt) {}
+  virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}
-  virtual void SetParamConfig(
-      AsyncWorkerParamConfig * param_config) {}
 #endif
 private:
@@ -195,7 +192,7 @@ ExecutorThreadWorker()
 };
 #ifdef PADDLE_WITH_PSLIB
-class AsyncExecutorThreadWorker: public ExecutorThreadWorker {
+class AsyncExecutorThreadWorker : public ExecutorThreadWorker {
 public:
  AsyncExecutorThreadWorker() {}
  virtual ~AsyncExecutorThreadWorker() {}
@@ -211,13 +208,10 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker {
  void PushSparse(int table_id);
  void PushDense(int table_id);
-  void check_pull_push_memory(
-      const std::vector<uint64_t>& features,
-      std::vector<float*>& push_g,
-      int dim);
  void check_pull_push_memory(const std::vector<uint64_t>& features,
-                              std::vector<std::vector<float>>& push_g,
+                              std::vector<float*>* push_g, int dim);
-                              int dim);
+  void check_pull_push_memory(const std::vector<uint64_t>& features,
+                              std::vector<std::vector<float>>* push_g, int dim);
  void collect_feasign_info(int table_id);
 private:
@@ -232,7 +226,6 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker {
  std::map<uint64_t, std::vector<std::vector<float>>> _feature_value;
  std::map<uint64_t, std::vector<std::vector<float>>> _feature_push_value;
  std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
  std::shared_ptr<DensePullThread> _pull_dense_thread;
@@ -243,7 +236,6 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker {
  std::vector<::std::future<int32_t>> _push_dense_status;
  AsyncWorkerParamConfig* _param_config;
 };
 #endif