Unverified commit 0034273b authored by T tangwei12, committed by GitHub

add service (#29560)

* add service, remove ut on mac

* fix heter_profiler & add heter stop method

* fix code style
Parent c0163837
@@ -14,3 +14,17 @@ endif()
add_subdirectory(table)
add_subdirectory(test)
# keep this return() until CI supports brpc, so the service build below stays disabled
return()
add_subdirectory(service)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(fleet
SRCS fleet.cc
DEPS framework_proto ps_framework_proto ps_service variable_helper scope op_registry fs shell ${RPC_DEPS})
target_link_libraries(fleet z)
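# Illustrative sketch only (not part of this commit): a downstream target
# would link the fleet library via the usual Paddle cc_* helpers, e.g.
#   cc_test(fleet_usage_demo SRCS fleet_usage_demo.cc DEPS fleet)
# where the target name and source file are placeholders.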
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/fleet.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/distributed/table/table.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace distributed {
using framework::LoDTensor;
using framework::ProgramDesc;
using framework::VarDesc;
using framework::Variable;
const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100;
std::shared_ptr<FleetWrapper> FleetWrapper::s_instance_ = NULL;
bool FleetWrapper::is_initialized_ = false;
std::shared_ptr<paddle::distributed::PSCore> FleetWrapper::pserver_ptr_ = NULL;
void FleetWrapper::SetClient2ClientConfig(int request_timeout_ms,
int connect_timeout_ms,
int max_retry) {
client2client_request_timeout_ms_ = request_timeout_ms;
client2client_connect_timeout_ms_ = connect_timeout_ms;
client2client_max_retry_ = max_retry;
}
void FleetWrapper::LoadSparseOnServer(const std::string& path,
const std::string& meta,
uint32_t table_id) {
VLOG(3) << "load sparse table " << table_id << " with " << path << " meta "
<< meta;
pserver_ptr_->_server_ptr->table(table_id)->load(path, meta);
}
void FleetWrapper::InitServer(const std::string& dist_desc,
const std::vector<std::string>& host_sign_list,
int index) {
if (!is_initialized_) {
VLOG(3) << "Going to init server";
pserver_ptr_ = std::shared_ptr<paddle::distributed::PSCore>(
new paddle::distributed::PSCore());
pserver_ptr_->init_server(dist_desc, &host_sign_list, host_sign_list.size(),
index);
is_initialized_ = true;
} else {
VLOG(3) << "Server can be initialized only once";
}
}
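// Illustrative usage sketch only, based on the methods declared in fleet.h;
// dist_desc / host_sign_list contents and the port below are placeholders:
//   auto fleet = FleetWrapper::GetInstance();
//   fleet->InitServer(dist_desc, host_sign_list, /*index=*/0);
//   fleet->RunServer("127.0.0.1", 8500);
//   // a worker later stops the server via StopServer(), which goes through
//   // the Communicator.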
// void FleetWrapper::InitWorker(
// const std::string& dist_desc, const std::vector<uint64_t>&
// host_sign_list, Scope* scope, const RpcCtxMap& send_ctx, const
// std::unordered_map<uint64_t, std::vector<std::string>>&
// dense_varnames,
// const std::map<std::string, std::string>& envs, int node_num, int index)
// {
// if (!is_initialized_) {
// VLOG(3) << "Going to init worker";
// Communicator::InitInstance<AsyncCommunicator>(
// send_ctx, dense_varnames, dist_desc, host_sign_list, scope, envs);
// pserver_ptr_ = std::shared_ptr<paddle::distributed::PSCore>(
// new paddle::distributed::PSCore());
// pserver_ptr_->init_worker(dist_desc, _regions,
// const_cast<uint64_t*>(host_sign_list.data()),
// node_num, index);
// is_initialized_ = true;
// } else {
// VLOG(3) << "Worker can be initialized only once";
// }
// }
void FleetWrapper::InitWorker(
const std::string& dist_desc,
const std::vector<std::string>& host_sign_list, Scope* scope,
const RpcCtxMap& send_ctx,
const std::unordered_map<uint64_t, std::vector<std::string>>&
dense_varnames,
const std::map<std::string, std::string>& envs, int node_num, int index) {
if (!is_initialized_) {
VLOG(3) << "Going to init worker";
Communicator::InitInstance<AsyncCommunicator>(
send_ctx, dense_varnames, dist_desc, host_sign_list, scope, envs);
pserver_ptr_ = std::shared_ptr<paddle::distributed::PSCore>(
new paddle::distributed::PSCore());
pserver_ptr_->init_worker(dist_desc, _regions, &host_sign_list, node_num,
index);
is_initialized_ = true;
} else {
VLOG(3) << "Worker can be initialized only once";
}
}
void FleetWrapper::StopServer() {
VLOG(3) << "Going to stop server";
auto* communicator = Communicator::GetInstance();
auto status = communicator->_worker_ptr->stop_server();
status.wait();
}
void FleetWrapper::FinalizeWorker() {
VLOG(3) << "Going to finalize worker";
pserver_ptr_->finalize_worker();
}
void FleetWrapper::BarrierWithTable(uint32_t barrier_type) {
VLOG(3) << "Going to Barrier worker";
auto* communicator = Communicator::GetInstance();
communicator->BarrierWithTable(barrier_type);
}
uint64_t FleetWrapper::RunServer(const std::string& ip, uint32_t port) {
VLOG(3) << "Going to run server with ip " << ip << " port " << port;
auto ret = pserver_ptr_->run_server(ip, port);
return ret;
}
std::vector<uint64_t> FleetWrapper::GetClientsInfo() {
VLOG(3) << "Going to get client info";
return pserver_ptr_->get_client_info();
}
void FleetWrapper::CreateClient2ClientConnection() {
VLOG(3) << "Going to create client2client connection";
pserver_ptr_->create_client2client_connection(
client2client_request_timeout_ms_, client2client_connect_timeout_ms_,
client2client_max_retry_);
}
std::future<int32_t> FleetWrapper::PullSparseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names, std::vector<uint64_t>* fea_keys,
std::vector<std::vector<float>>* fea_values, int fea_value_dim) {
fea_keys->clear();
fea_keys->resize(0);
fea_keys->reserve(MAX_FEASIGN_NUM);
for (auto name : var_names) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
int64_t* ids = tensor->data<int64_t>();
size_t len = tensor->numel();
for (auto i = 0u; i < len; ++i) {
if (ids[i] == 0u) {
continue;
}
fea_keys->push_back(static_cast<uint64_t>(ids[i]));
}
}
fea_values->resize(fea_keys->size() + 1);
for (auto& t : *fea_values) {
t.resize(fea_value_dim);
}
std::vector<float*> pull_result_ptr;
for (auto& t : *fea_values) {
pull_result_ptr.push_back(t.data());
}
return pserver_ptr_->_worker_ptr->pull_sparse(
pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size());
}
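// Illustrative usage sketch only: the caller keeps fea_keys / fea_values
// alive until the returned future resolves, e.g.
//   std::vector<uint64_t> keys;
//   std::vector<std::vector<float>> values;
//   auto fut = fleet->PullSparseVarsAsync(scope, table_id, var_names, &keys,
//                                         &values, fea_value_dim);
//   if (fut.get() != 0) { /* handle pull failure */ }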
void FleetWrapper::PullSparseVarsSync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names, std::vector<uint64_t>* fea_keys,
std::vector<std::vector<float>>* fea_values, int fea_value_dim,
const std::vector<std::string>& var_emb_names) {
std::vector<std::future<int32_t>> pull_sparse_status;
pull_sparse_status.resize(0);
fea_keys->clear();
fea_keys->resize(0);
fea_keys->reserve(MAX_FEASIGN_NUM);
for (size_t var_index = 0; var_index < var_names.size(); ++var_index) {
const std::string& name = var_names[var_index];
Variable* var = scope.FindVar(name);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
int64_t* ids = tensor->data<int64_t>();
size_t len = tensor->numel();
// skip slots which do not have embedding
const std::string& emb_name = var_emb_names[var_index];
Variable* emb_var = scope.FindVar(emb_name);
if (emb_var == nullptr) {
continue;
}
for (auto i = 0u; i < len; ++i) {
if (ids[i] == 0u) {
continue;
}
fea_keys->push_back(static_cast<uint64_t>(ids[i]));
}
}
fea_values->resize(fea_keys->size() + 1);
for (auto& t : *fea_values) {
t.resize(fea_value_dim);
}
std::vector<float*> pull_result_ptr;
for (auto& t : *fea_values) {
pull_result_ptr.push_back(t.data());
}
auto status = pserver_ptr_->_worker_ptr->pull_sparse(
pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size());
pull_sparse_status.push_back(std::move(status));
for (auto& t : pull_sparse_status) {
t.wait();
auto status = t.get();
if (status != 0) {
LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
}
}
void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
uint64_t padding_id,
platform::Place place,
std::vector<const LoDTensor*>* inputs,
std::vector<LoDTensor*>* outputs) {
std::vector<uint64_t> fea_keys;
std::vector<float*> pull_result_ptr;
fea_keys.reserve(MAX_FEASIGN_NUM / 100);
pull_result_ptr.reserve(MAX_FEASIGN_NUM / 100);
std::vector<float> init_value(fea_dim, 0);
framework::LoDTensor* output = nullptr;
float* output_data = nullptr;
size_t output_index = -1;
size_t output_len = 0;
for (size_t index = 0; index < inputs->size(); ++index) {
const framework::LoDTensor* tensor = inputs->at(index);
const int64_t* ids = tensor->data<int64_t>();
size_t len = tensor->numel();
for (size_t i = 0; i < len; ++i, output_len += fea_dim) {
if (!output || output_len == size_t(output->numel())) {
++output_index;
CHECK(output_index < outputs->size()); // NOLINT
output = outputs->at(output_index);
output->set_lod(tensor->lod());
output_data = output->mutable_data<float>(place);
output_len = 0;
CHECK(output->numel() % fea_dim == 0); // NOLINT
CHECK(output_data != nullptr); // NOLINT
}
uint64_t real_id = static_cast<uint64_t>(ids[i]);
if (real_id == padding_id) {
memcpy(output_data + output_len, init_value.data(),
sizeof(float) * fea_dim);
continue;
}
fea_keys.push_back(real_id);
pull_result_ptr.push_back(output_data + output_len);
}
}
auto* communicator = Communicator::GetInstance();
auto status = communicator->_worker_ptr->pull_sparse(
pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size());
status.wait();
auto ret = status.get();
if (ret != 0) {
LOG(ERROR) << "fleet pull sparse failed, status[" << ret << "]";
sleep(sleep_seconds_before_fail_exit_);
}
}
void FleetWrapper::PullDenseVarsAsync(
const Scope& scope, const uint64_t tid,
const std::vector<std::string>& var_names,
std::vector<std::future<int32_t>>* pull_dense_status, bool in_cpu) {
auto& regions = _regions[tid];
regions.clear();
regions.resize(var_names.size());
for (auto i = 0u; i < var_names.size(); ++i) {
std::string varname = var_names[i];
if (!in_cpu) {
varname = var_names[i] + "pin";
}
Variable* var = scope.FindVar(varname);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
regions[i] = std::move(reg);
}
auto status = pserver_ptr_->_worker_ptr->pull_dense(regions.data(),
regions.size(), tid);
pull_dense_status->push_back(std::move(status));
}
void FleetWrapper::PullDenseVarsSync(
const Scope& scope, const uint64_t tid,
const std::vector<std::string>& var_names) {
auto& regions = _regions[tid];
regions.clear();
regions.reserve(var_names.size());
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
regions.emplace_back(std::move(reg));
}
auto* communicator = Communicator::GetInstance();
auto status = communicator->_worker_ptr->pull_dense(regions.data(),
regions.size(), tid);
status.wait();
}
void FleetWrapper::PushDenseParamSync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names) {
auto place = platform::CPUPlace();
std::vector<paddle::distributed::Region> regions;
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* g = tensor->mutable_data<float>(place);
paddle::distributed::Region reg(g, tensor->numel());
regions.emplace_back(std::move(reg));
}
auto* communicator = Communicator::GetInstance();
auto push_status = communicator->_worker_ptr->push_dense_param(
regions.data(), regions.size(), table_id);
push_status.wait();
auto status = push_status.get();
CHECK(status == 0) << "push dense param failed, status[" << status << "]";
}
void FleetWrapper::PushDenseVarsSync(
Scope* scope, const uint64_t table_id,
const std::vector<std::string>& var_names) {}
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<std::future<int32_t>>* push_sparse_status, float scale_datanorm,
int batch_size) {
auto* communicator = Communicator::GetInstance();
PADDLE_ENFORCE_EQ(
communicator->Check(table_id), true,
platform::errors::InvalidArgument(
"can not find table: %s, please check your config", table_id));
communicator->Send(var_names, scope);
}
void FleetWrapper::PushSparseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::string& grad_varname,
std::vector<std::future<int32_t>>* push_sparse_status) {
std::vector<std::string> varnames;
varnames.push_back(grad_varname);
auto* communicator = Communicator::GetInstance();
PADDLE_ENFORCE_EQ(
communicator->Check(table_id), true,
platform::errors::InvalidArgument(
"can not find table: %s, please check your config", table_id));
communicator->Send(varnames, scope);
}
void FleetWrapper::PushSparseVarsWithLabelAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<uint64_t>& fea_keys, const std::vector<float>& fea_labels,
const std::vector<std::string>& sparse_key_names,
const std::vector<std::string>& sparse_grad_names, const int emb_dim,
std::vector<std::vector<float>>* push_values,
std::vector<std::future<int32_t>>* push_sparse_status, const int batch_size,
const bool use_cvm, const bool dump_slot,
std::vector<uint64_t>* sparse_push_keys, const bool no_cvm) {
// not support
return;
}
void FleetWrapper::PushSparseFromTensorWithLabelAsync(
const Scope& scope, const uint64_t table_id, int fea_dim,
uint64_t padding_id, bool scale_sparse, const std::string& accesor,
const std::string& click_name, platform::Place place,
const std::vector<std::string>& input_names,
std::vector<const LoDTensor*>* inputs,
std::vector<const LoDTensor*>* outputs) {
// not support
return;
}
void FleetWrapper::LoadModel(const std::string& path, const int mode) {
auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode));
ret.wait();
if (ret.get() != 0) {
LOG(ERROR) << "load model from path:" << path << " failed";
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
}
void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
const std::string& path, const int mode) {
auto ret =
pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode));
ret.wait();
if (ret.get() != 0) {
LOG(ERROR) << "load model of table id: " << table_id
<< ", from path: " << path << " failed";
}
}
void FleetWrapper::SaveModel(const std::string& path, const int mode) {
auto* communicator = Communicator::GetInstance();
auto ret = communicator->_worker_ptr->save(path, std::to_string(mode));
ret.wait();
int32_t feasign_cnt = ret.get();
if (feasign_cnt == -1) {
LOG(ERROR) << "save model failed";
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
}
void FleetWrapper::SaveModelOneTable(const uint64_t table_id,
const std::string& path, const int mode) {
auto* communicator = Communicator::GetInstance();
auto ret =
communicator->_worker_ptr->save(table_id, path, std::to_string(mode));
ret.wait();
if (ret.get() != 0) {
LOG(ERROR) << "save model of table id: " << table_id
<< ", to path: " << path << " failed";
}
}
void FleetWrapper::PrintTableStat(const uint64_t table_id) {
auto* communicator = Communicator::GetInstance();
auto ret = communicator->_worker_ptr->print_table_stat(table_id);
ret.wait();
int32_t err_code = ret.get();
if (err_code == -1) {
LOG(ERROR) << "print table stat failed";
}
}
void FleetWrapper::ShrinkSparseTable(int table_id) {
auto ret = pserver_ptr_->_worker_ptr->shrink(table_id);
ret.wait();
}
void FleetWrapper::ClearModel() {
auto ret = pserver_ptr_->_worker_ptr->clear();
ret.wait();
}
void FleetWrapper::ClearOneTable(const uint64_t table_id) {
auto ret = pserver_ptr_->_worker_ptr->clear(table_id);
ret.wait();
}
void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
std::vector<std::string> var_list,
float decay, int emb_dim) {
std::vector<paddle::distributed::Region> regions;
for (std::string& name : var_list) {
if (name.find("batch_sum") != std::string::npos) {
Variable* var = scope->FindVar(name);
CHECK(var != nullptr) << "var[" << name << "] not found";
VLOG(0) << "prepare shrink dense batch_sum";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* g = tensor->data<float>();
// show_batch_sum += N * log(decay)
std::string size_name = name;
size_name.replace(size_name.find("batch_sum"), size_name.length(),
"batch_size");
Variable* var_size = scope->FindVar(size_name);
CHECK(var_size != nullptr) << "var[" << size_name << "] not found";
VLOG(3) << "shrink dense batch_sum: " << name << ", " << size_name;
float* g_size = var_size->GetMutable<LoDTensor>()->data<float>();
for (int k = 0; k < tensor->numel(); k += emb_dim) {
g[k] = g[k] + g_size[k] * log(decay);
}
paddle::distributed::Region reg(g, tensor->numel());
regions.emplace_back(std::move(reg));
} else {
Variable* var = scope->FindVar(name);
CHECK(var != nullptr) << "var[" << name << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* g = tensor->data<float>();
paddle::distributed::Region reg(g, tensor->numel());
regions.emplace_back(std::move(reg));
}
}
auto push_status = pserver_ptr_->_worker_ptr->push_dense_param(
regions.data(), regions.size(), table_id);
push_status.wait();
auto status = push_status.get();
if (status != 0) {
// PADDLE_THORW(platform::errors::Fatal(
// "push shrink dense param failed, status is [%d].", status));
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
}
void FleetWrapper::ClientFlush() {
auto ret = pserver_ptr_->_worker_ptr->flush();
ret.wait();
}
int FleetWrapper::RegisterClientToClientMsgHandler(int msg_type,
MsgHandlerFunc handler) {
VLOG(3) << "calling FleetWrapper::RegisterClientToClientMsgHandler";
VLOG(3) << "pserver_ptr_=" << pserver_ptr_;
VLOG(3) << "_worker_ptr=" << pserver_ptr_->_worker_ptr;
return pserver_ptr_->_worker_ptr->registe_client2client_msg_handler(msg_type,
handler);
}
std::future<int32_t> FleetWrapper::SendClientToClientMsg(
int msg_type, int to_client_id, const std::string& msg) {
return pserver_ptr_->_worker_ptr->send_client2client_msg(msg_type,
to_client_id, msg);
}
std::default_random_engine& FleetWrapper::LocalRandomEngine() {
struct engine_wrapper_t {
std::default_random_engine engine;
engine_wrapper_t() {
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
double cur_time = tp.tv_sec + tp.tv_nsec * 1e-9;
static std::atomic<uint64_t> x(0);
std::seed_seq sseq = {x++, x++, x++, (uint64_t)(cur_time * 1000)};
engine.seed(sseq);
}
};
thread_local engine_wrapper_t r;
return r.engine;
}
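// Illustrative usage sketch only: draw samples from the thread-local engine
// with a standard distribution, e.g.
//   std::uniform_real_distribution<float> dist(0.0f, 1.0f);
//   float sample = dist(FleetWrapper::GetInstance()->LocalRandomEngine());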
size_t FleetWrapper::GetAbsoluteSum(size_t start, size_t end, size_t level,
const framework::LoD& lod) {
if (level >= lod.size() - 1) {
return end - start;
}
size_t ret = 0;
for (size_t i = start; i < end - 1; ++i) {
size_t pos1 = lod[level][i];
size_t pos2 = lod[level][i + 1];
ret += GetAbsoluteSum(pos1, pos2, level + 1, lod);
}
return ret;
}
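// Worked example (illustrative): with a two-level LoD
// {{0, 2, 5}, {0, 3, 4, 6, 8, 9}}, GetAbsoluteSum(0, 3, 0, lod) visits
// i = 0, 1 at level 0 and recurses with [0, 2) and [2, 5) at level 1, which
// hit the base case and return 2 and 3, so the call yields 5.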
} // end namespace distributed
} // end namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <ctime>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#include <ThreadPool.h>
#include "paddle/fluid/distributed/communicator_common.h"
#include "paddle/fluid/distributed/service/service.h"
#include "paddle/fluid/framework/archive.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace distributed {
using framework::LoDTensor;
using framework::Scope;
using framework::SelectedRows;
using framework::Variable;
using RpcCtxMap = std::unordered_map<std::string, CommContext>;
class FleetWrapper {
public:
virtual ~FleetWrapper() {}
FleetWrapper() {
scale_sparse_gradient_with_batch_size_ = true;
// the trainer sleeps for a while so the pserver has time to dump core
sleep_seconds_before_fail_exit_ = 300;
// client to client request timeout in ms
client2client_request_timeout_ms_ = 500000;
// client to client connect timeout in ms
client2client_connect_timeout_ms_ = 10000;
// client to client request max retry
client2client_max_retry_ = 3;
}
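// Illustrative only: the defaults above can be overridden before
// CreateClient2ClientConnection() is called, e.g.
//   fleet->SetClient2ClientConfig(/*request_timeout_ms=*/300000,
//                                 /*connect_timeout_ms=*/10000,
//                                 /*max_retry=*/3);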
// set client to client communication config
void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms,
int max_retry);
// Pull sparse variables from server in sync mode
// Param<in>: scope, table_id, var_names, fea_keys, fea_dim, var_emb_names
// Param<out>: fea_values
void PullSparseVarsSync(const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<uint64_t>* fea_keys,
std::vector<std::vector<float>>* fea_values,
int fea_dim,
const std::vector<std::string>& var_emb_names);
// Pull sparse variables from server in async mode
// Param<in>: scope, table_id, var_names, fea_keys, fea_dim
// Param<out>: fea_values std::future
std::future<int32_t> PullSparseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<uint64_t>* fea_keys,
std::vector<std::vector<float>>* fea_values, int fea_dim);
// Pull sparse variables from server in sync mode
// pull immediately to tensors
void PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
uint64_t padding_id, platform::Place place,
std::vector<const LoDTensor*>* inputs, // NOLINT
std::vector<LoDTensor*>* outputs); // NOLINT
// pull dense variables from server in sync mode
// Param<in>: scope, table_id, var_names
// Param<out>: void
void PullDenseVarsSync(const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names);
// pull dense variables from server in async mode
// Param<in>: scope, table_id, var_names
// Param<out>: pull_dense_status
void PullDenseVarsAsync(const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<std::future<int32_t>>* pull_dense_status,
bool in_cpu);
// push dense parameters(not gradients) to server in sync mode
void PushDenseParamSync(const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names);
void PushDenseVarsAsync(const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size);
// push dense variables to server in sync mode
void PushDenseVarsSync(Scope* scope, const uint64_t table_id,
const std::vector<std::string>& var_names);
void PushSparseVarsAsync(
const Scope& scope, const uint64_t table_id, const std::string& grad,
std::vector<std::future<int32_t>>* push_sparse_status);
// This is specially designed for click/show stats in server
// Param<in>: scope, table_id, fea_keys, fea_labels, sparse_key_names,
// sparse_grad_names, batch_size, use_cvm, dump_slot
// Param<out>: push_values, push_sparse_status
void PushSparseVarsWithLabelAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<uint64_t>& fea_keys,
const std::vector<float>& fea_labels,
const std::vector<std::string>& sparse_key_names,
const std::vector<std::string>& sparse_grad_names, const int emb_dim,
std::vector<std::vector<float>>* push_values,
std::vector<std::future<int32_t>>* push_sparse_status,
const int batch_size, const bool use_cvm, const bool dump_slot,
std::vector<uint64_t>* sparse_push_keys, const bool no_cvm);
// Push sparse variables to server in async mode
void PushSparseFromTensorWithLabelAsync(
const Scope& scope, const uint64_t table_id, int fea_dim,
uint64_t padding_id, bool scale_sparse, const std::string& accesor,
const std::string& click_name, platform::Place place,
const std::vector<std::string>& input_names,
std::vector<const LoDTensor*>* inputs, // NOLINT
std::vector<const LoDTensor*>* outputs); // NOLINT
// Push sparse variables to server in Async mode
// Param<In>: scope, table_id, fea_keys, sparse_grad_names
// Param<Out>: push_values, push_sparse_status
// init server
void LoadSparseOnServer(const std::string& path, const std::string& meta,
uint32_t table_id);
// init server
// void InitServer(const std::string& dist_desc,
// const std::vector<uint64_t>& host_sign_list, int index);
void InitServer(const std::string& dist_desc,
const std::vector<std::string>& host_sign_list, int index);
// init trainer
void InitWorker(const std::string& dist_desc,
const std::vector<std::string>& host_sign_list, Scope* scope,
const RpcCtxMap& send_ctx,
const std::unordered_map<uint64_t, std::vector<std::string>>&
dense_varnames,
const std::map<std::string, std::string>& envs, int node_num,
int index);
// stop server
void StopServer();
// finalize the worker so that it can be stopped
void FinalizeWorker();
// run server with ip port
uint64_t RunServer(const std::string& ip, uint32_t port);
// get client info
std::vector<uint64_t> GetClientsInfo();
// create client to client connection
void CreateClient2ClientConnection();
// flush all push requests
void ClientFlush();
// barrier with barrier table
void BarrierWithTable(uint32_t barrier_type);
void PrintTableStat(const uint64_t table_id);
// mode = 0, load all feature
// mode = 1, load delta feature, which means load diff
void LoadModel(const std::string& path, const int mode);
// mode = 0, load all feature
// mode = 1, load delta feature, which means load diff
void LoadModelOneTable(const uint64_t table_id, const std::string& path,
const int mode);
// mode = 0, save all feature
// mode = 1, save delta feature, which means save diff
void SaveModel(const std::string& path, const int mode);
// mode = 0, save all feature
// mode = 1, save delta feature, which means save diff
void SaveModelOneTable(const uint64_t table_id, const std::string& path,
const int mode);
// clear all models, release their memory
void ClearModel();
// clear one table
void ClearOneTable(const uint64_t table_id);
// shrink sparse table
void ShrinkSparseTable(int table_id);
// shrink dense table
void ShrinkDenseTable(int table_id, Scope* scope,
std::vector<std::string> var_list, float decay,
int emb_dim);
typedef std::function<int32_t(int, int, const std::string&)> MsgHandlerFunc;
// register client to client communication
int RegisterClientToClientMsgHandler(int msg_type, MsgHandlerFunc handler);
// send client to client message
std::future<int32_t> SendClientToClientMsg(int msg_type, int to_client_id,
const std::string& msg);
// FleetWrapper singleton
static std::shared_ptr<FleetWrapper> GetInstance() {
if (NULL == s_instance_) {
s_instance_.reset(new paddle::distributed::FleetWrapper());
}
return s_instance_;
}
// this performs better than rand_r, especially for large data
std::default_random_engine& LocalRandomEngine();
static std::shared_ptr<paddle::distributed::PSCore> pserver_ptr_;
private:
static std::shared_ptr<FleetWrapper> s_instance_;
size_t GetAbsoluteSum(size_t start, size_t end, size_t level,
const framework::LoD& lod);
protected:
static bool is_initialized_;
std::map<uint64_t, std::vector<paddle::distributed::Region>> _regions;
bool scale_sparse_gradient_with_batch_size_;
int32_t sleep_seconds_before_fail_exit_;
int client2client_request_timeout_ms_;
int client2client_connect_timeout_ms_;
int client2client_max_retry_;
DISABLE_COPY_AND_ASSIGN(FleetWrapper);
};
} // end namespace distributed
} // end namespace paddle
set(BRPC_SRCS ps_client.cc server.cc)
set_source_files_properties(${BRPC_SRCS})
set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy)
brpc_library(sendrecv_rpc SRCS
${BRPC_SRCS}
PROTO sendrecv.proto
DEPS ${BRPC_DEPS} )
set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(downpour_server SRCS brpc_ps_server.cc DEPS boost eigen3 table ${RPC_DEPS})
cc_library(downpour_client SRCS brpc_ps_client.cc DEPS boost eigen3 table ${RPC_DEPS})
cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS})
cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS})
cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS})
cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RPC_DEPS})
cc_library(brpc_utils SRCS brpc_utils.cc DEPS ${COMMON_DEPS} ${RPC_DEPS})
cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "Eigen/Dense"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/table/table.h"
#include "paddle/fluid/framework/archive.h"
const static int max_port = 65535;
DEFINE_int32(pserver_push_dense_merge_limit, 12,
"limit max push_dense local merge requests");
DEFINE_int32(pserver_push_sparse_merge_limit, 12,
"limit max push_sparse local merge requests");
DEFINE_int32(pserver_pull_dense_limit, 12,
"limit max push_sparse local merge requests");
DEFINE_int32(pserver_async_push_dense_interval_ms, 10,
"async push_dense to server interval");
DEFINE_int32(pserver_async_push_sparse_interval_ms, 10,
"async push_sparse to server interval");
DEFINE_bool(pserver_scale_gradient_by_merge, false,
"scale dense gradient when merged");
DEFINE_int32(pserver_communicate_compress_type, 0,
"none:0 snappy:1 gzip:2 zlib:3 lz4:4");
DEFINE_int32(pserver_max_async_call_num, 13,
"max task num in async_call_server");
DEFINE_int32(pserver_timeout_ms, 500000, "pserver request server timeout_ms");
DEFINE_int32(pserver_connect_timeout_ms, 10000,
"pserver connect server timeout_ms");
DEFINE_int32(pserver_sparse_merge_thread, 1, "pserver sparse merge thread num");
namespace paddle {
namespace distributed {
inline size_t get_sparse_shard(uint32_t shard_num, uint32_t server_num,
uint64_t key) {
size_t remind = shard_num % server_num;
size_t local_shard_num =
remind == 0 ? shard_num / server_num : shard_num / server_num + 1;
return (key % shard_num) / local_shard_num;
}
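// Worked example (illustrative): with shard_num = 10 and server_num = 3,
// remind = 1 and local_shard_num = 10 / 3 + 1 = 4; a key with
// key % 10 == 6 therefore lands on server 6 / 4 = 1.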
void DownpourPsClientService::service(
::google::protobuf::RpcController *controller,
const ::paddle::PsRequestMessage *request,
::paddle::PsResponseMessage *response, ::google::protobuf::Closure *done) {
brpc::ClosureGuard done_guard(done);
int ret = _client->handle_client2client_msg(
request->cmd_id(), request->client_id(), request->data());
response->set_err_code(0);
response->set_err_msg("");
if (ret != 0) {
response->set_err_code(-1);
response->set_err_msg("handle_client2client_msg failed");
}
}
// start the client-side RpcService, used for client-to-client data exchange
int32_t BrpcPsClient::start_client_service() {
if (_service.configure(this, _client_id) != 0) {
LOG(ERROR)
<< "service initialize failed, service_name:DownpourPsClientService";
return -1;
}
_server.AddService(&_service, brpc::SERVER_DOESNT_OWN_SERVICE);
brpc::ServerOptions options;
int start_port = 8500;
options.num_threads = 24;
if (_server.Start(butil::my_ip_cstr(), brpc::PortRange(start_port, max_port),
&options) != 0) {
LOG(ERROR) << "BrpcPsServer start failed";
return -1;
}
_env->registe_ps_client(butil::my_ip_cstr(), _server.listen_address().port,
_client_id);
return 0;
}
int32_t BrpcPsClient::create_client2client_connection(
int pserver_timeout_ms, int pserver_connect_timeout_ms, int max_retry) {
brpc::ChannelOptions options;
options.protocol = "baidu_std";
options.timeout_ms = pserver_timeout_ms;
options.connection_type = "pooled";
options.connect_timeout_ms = pserver_connect_timeout_ms;
options.max_retry = max_retry;
std::vector<PSHost> client_list = _env->get_ps_clients();
_client_channels.resize(client_list.size());
std::ostringstream os;
std::string server_ip_port;
for (size_t i = 0; i < client_list.size(); ++i) {
server_ip_port.assign(client_list[i].ip.c_str());
server_ip_port.append(":");
server_ip_port.append(std::to_string(client_list[i].port));
_client_channels[i].reset(new brpc::Channel());
if (_client_channels[i]->Init(server_ip_port.c_str(), "", &options) != 0) {
LOG(ERROR) << "psclient connect to client:" << server_ip_port
<< " Failed!";
}
os << server_ip_port << ",";
}
LOG(INFO) << "Client connect success:" << os.str();
return 0;
}
int32_t BrpcPsClient::initialize() {
_async_call_num = 0;
brpc::ChannelOptions options;
options.protocol = "baidu_std";
options.timeout_ms = FLAGS_pserver_timeout_ms;
options.connection_type = "pooled";
options.connect_timeout_ms = FLAGS_pserver_connect_timeout_ms;
options.max_retry = 3;
std::ostringstream os;
std::string server_ip_port;
std::string client_ip(butil::my_ip_cstr());
// get the list of servers and connect to each of them
std::vector<PSHost> server_list = _env->get_ps_servers();
_server_channels.resize(server_list.size());
for (size_t i = 0; i < server_list.size(); ++i) {
server_ip_port.assign(server_list[i].ip.c_str());
server_ip_port.append(":");
server_ip_port.append(std::to_string(server_list[i].port));
for (size_t j = 0; j < _server_channels[i].size(); ++j) {
_server_channels[i][j].reset(new brpc::Channel());
if (_server_channels[i][j]->Init(server_ip_port.c_str(), "", &options) !=
0) {
LOG(ERROR) << "psclient connect to server:" << server_ip_port
<< " Failed!";
return -1;
}
}
os << server_ip_port << ",";
}
// start the client's listening service and build connections between clients
start_client_service();
_running = true;
_flushing = false;
return 0;
}
int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) {
if (_cntls[request_idx]->Failed()) {
LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, "
"err:"
<< _cntls[request_idx]->ErrorText();
return -1;
}
if (_responses[request_idx].err_code() != 0) {
LOG(ERROR) << "response ret bad, server_idx:" << request_idx
<< "cmd_id:" << cmd_id
<< " err_code:" << _responses[request_idx].err_code()
<< " err_msg:" << _responses[request_idx].err_msg();
return -1;
}
return 0;
}
int DownpourBrpcClosure::check_save_response(size_t request_idx, int cmd_id) {
int32_t feasign_size = 0;  // signed so the error-code check below works
if (_cntls[request_idx]->Failed()) {
LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, "
"err:"
<< _cntls[request_idx]->ErrorText();
return -1;
}
feasign_size = _responses[request_idx].err_code();
if (feasign_size < 0) {
LOG(ERROR) << "response ret bad, server_idx:" << request_idx
<< "cmd_id:" << cmd_id
<< " err_code:" << _responses[request_idx].err_code()
<< " err_msg:" << _responses[request_idx].err_msg();
return -1;
}
return feasign_size;
}
std::string DownpourBrpcClosure::get_response(size_t request_idx, int cmd_id) {
std::string data = _responses[request_idx].data();
return data;
}
std::future<int32_t> BrpcPsClient::print_table_stat(uint32_t table_id) {
size_t request_call_num = _server_channels.size();
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [request_call_num, table_id](void *done) {
int ret = 0;
uint64_t feasign_size = 0;
uint64_t mf_size = 0;
paddle::framework::BinaryArchive ar;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_response(i, PS_PRINT_TABLE_STAT) != 0) {
ret = -1;
break;
}
std::string resp = closure->get_response(i, PS_PRINT_TABLE_STAT);
ar.SetReadBuffer(const_cast<char *>(resp.c_str()), resp.length(),
nullptr);
feasign_size += ar.Get<uint64_t>();
mf_size += ar.Get<uint64_t>();
}
closure->set_promise_value(ret);
std::cout << "table id: " << table_id
<< ", feasign size: " << feasign_size
<< ", mf size: " << mf_size << std::endl;
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (size_t i = 0; i < request_call_num; ++i) {
closure->request(i)->set_cmd_id(PS_PRINT_TABLE_STAT);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
PsService_Stub rpc_stub(get_cmd_channel(i));
closure->cntl(i)->set_timeout_ms(
10800000);  // use a long timeout for cmd messages such as save/load
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
return fut;
}
std::future<int32_t> BrpcPsClient::send_cmd(
uint32_t table_id, int cmd_id, const std::vector<std::string> &params) {
size_t request_call_num = _server_channels.size();
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [request_call_num, cmd_id](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_response(i, cmd_id) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (size_t i = 0; i < request_call_num; ++i) {
closure->request(i)->set_cmd_id(cmd_id);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
for (const auto &param : params) {
closure->request(i)->add_params(param);
}
PsService_Stub rpc_stub(get_cmd_channel(i));
closure->cntl(i)->set_timeout_ms(
10800000);  // use a long timeout for cmd messages such as save/load
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
return fut;
}
std::future<int32_t> BrpcPsClient::send_save_cmd(
uint32_t table_id, int cmd_id, const std::vector<std::string> &params) {
size_t request_call_num = _server_channels.size();
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [request_call_num, cmd_id](void *done) {
int ret = 0;
uint32_t feasign_size = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_save_response(i, cmd_id) < 0) {
ret = -1;
break;
}
feasign_size += closure->check_save_response(i, cmd_id);
}
if (ret == 0) {
closure->set_promise_value(feasign_size);
} else {
closure->set_promise_value(ret);
}
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (size_t i = 0; i < request_call_num; ++i) {
closure->request(i)->set_cmd_id(cmd_id);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
for (const auto &param : params) {
closure->request(i)->add_params(param);
}
PsService_Stub rpc_stub(get_cmd_channel(i));
closure->cntl(i)->set_timeout_ms(
10800000);  // use a long timeout for cmd messages such as save/load
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
return fut;
}
std::future<int32_t> BrpcPsClient::shrink(uint32_t table_id) {
return send_cmd(table_id, PS_SHRINK_TABLE, {std::string("1")});
}
std::future<int32_t> BrpcPsClient::load(const std::string &epoch,
const std::string &mode) {
return send_cmd(-1, PS_LOAD_ALL_TABLE, {epoch, mode});
}
std::future<int32_t> BrpcPsClient::load(uint32_t table_id,
const std::string &epoch,
const std::string &mode) {
return send_cmd(table_id, PS_LOAD_ONE_TABLE, {epoch, mode});
}
std::future<int32_t> BrpcPsClient::save(const std::string &epoch,
const std::string &mode) {
return send_save_cmd(-1, PS_SAVE_ALL_TABLE, {epoch, mode});
}
std::future<int32_t> BrpcPsClient::save(uint32_t table_id,
const std::string &epoch,
const std::string &mode) {
return send_save_cmd(table_id, PS_SAVE_ONE_TABLE, {epoch, mode});
}
std::future<int32_t> BrpcPsClient::clear() {
return send_cmd(-1, PS_CLEAR_ALL_TABLE, {});
}
std::future<int32_t> BrpcPsClient::clear(uint32_t table_id) {
return send_cmd(table_id, PS_CLEAR_ONE_TABLE, {});
}
std::future<int32_t> BrpcPsClient::flush() {
_flushing = true;
std::promise<int> promise;
std::future<int32_t> fut = promise.get_future();
do {
VLOG(3) << "wait _async_call_num:" << _async_call_num;
usleep(100000); // sleep 100ms wait async end
} while (_async_call_num > 0);
promise.set_value(0);
_flushing = false;
return fut;
}
void BrpcPsClient::finalize_worker() {
flush();
_running = false;
_server.Stop(1000);
_server.Join();
}
std::future<int32_t> BrpcPsClient::stop_server() {
return send_cmd(-1, PS_STOP_SERVER, {});
}
std::future<int32_t> BrpcPsClient::start_profiler() {
return send_cmd(-1, PS_START_PROFILER, {});
}
std::future<int32_t> BrpcPsClient::stop_profiler() {
return send_cmd(-1, PS_STOP_PROFILER, {});
}
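// Illustrative only: a worker holding a BrpcPsClient can bracket a region of
// interest with server-side profiling, e.g.
//   client->start_profiler().wait();
//   /* ... run the steps to be profiled ... */
//   client->stop_profiler().wait();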
std::future<int32_t> BrpcPsClient::barrier(size_t table_id,
uint32_t barrier_type) {
return send_cmd(table_id, PS_BARRIER, {std::to_string(barrier_type)});
}
std::future<int32_t> BrpcPsClient::pull_geo_param(size_t table_id,
std::vector<float> *values,
std::vector<uint64_t> *keys,
int pserver_idx) {
auto *accessor = table_accessor(table_id);
DownpourBrpcClosure *closure =
new DownpourBrpcClosure(1, [keys, values, accessor](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
uint32_t shard_nums;
if (closure->check_response(0, PS_PULL_GEO_PARAM) != 0) {
ret = -1;
}
auto &res_io_buffer = closure->cntl(0)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
io_buffer_itr.copy_and_forward((void *)(&shard_nums), sizeof(uint32_t));
keys->resize(shard_nums);
values->resize(shard_nums * accessor->update_dim());
io_buffer_itr.copy_and_forward((void *)(keys->data()),
sizeof(uint64_t) * shard_nums);
io_buffer_itr.copy_and_forward((void *)(values->data()),
shard_nums * accessor->update_size());
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
closure->request(0)->set_cmd_id(PS_PULL_GEO_PARAM);
closure->request(0)->set_table_id(table_id);
closure->request(0)->set_client_id(_client_id);
PsService_Stub rpc_stub(get_cmd_channel(pserver_idx));
closure->cntl(0)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0),
closure);
return fut;
}
std::future<int32_t> BrpcPsClient::push_sparse_param(
size_t table_id, const uint64_t *keys, const float **update_values,
size_t num, void *done) {
auto *accessor = table_accessor(table_id);
// send RPC requests
DownpourBrpcClosure *closure = reinterpret_cast<DownpourBrpcClosure *>(done);
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
size_t request_call_num = _server_channels.size();
std::vector<std::vector<uint64_t>> ids;
std::vector<std::vector<const float *>> value_ptrs;
ids.resize(request_call_num);
value_ptrs.resize(request_call_num);
for (size_t i = 0; i < num; ++i) {
size_t pserver_idx = keys[i] % request_call_num;
ids[pserver_idx].push_back(keys[i]);
value_ptrs[pserver_idx].push_back(update_values[i]);
}
for (size_t shard_idx = 0; shard_idx < request_call_num; ++shard_idx) {
auto kvs = ids[shard_idx];
auto value_ptr = value_ptrs[shard_idx];
size_t kv_size = kvs.size();
uint32_t value_size = accessor->update_size();
// send the RPC request
auto *push_request = closure->request(shard_idx);
push_request->set_cmd_id(PS_PUSH_SPARSE_PARAM);
push_request->set_table_id(table_id);
push_request->set_client_id(_client_id);
push_request->add_params((char *)&kv_size, sizeof(uint32_t));
auto *push_data = push_request->mutable_data();
push_data->resize(kv_size * (sizeof(uint64_t) + accessor->update_size()));
char *push_data_ptr = const_cast<char *>(push_data->data());
memcpy(push_data_ptr, kvs.data(), kv_size * sizeof(uint64_t));
push_data_ptr += kv_size * sizeof(uint64_t);
for (int i = 0; i < kv_size; ++i) {
memcpy(push_data_ptr, value_ptr[i], accessor->update_size());
push_data_ptr += accessor->update_size();
}
PsService_Stub rpc_stub(get_sparse_channel(shard_idx));
closure->cntl(shard_idx)->set_request_compress_type(
(brpc::CompressType)FLAGS_pserver_communicate_compress_type);
rpc_stub.service(closure->cntl(shard_idx), closure->request(shard_idx),
closure->response(shard_idx), closure);
}
return fut;
}
std::future<int32_t> BrpcPsClient::pull_dense(Region *regions,
size_t region_num,
size_t table_id) {
auto *accessor = table_accessor(table_id);
size_t request_call_num = _server_channels.size();
uint32_t num_per_shard =
dense_dim_per_shard(accessor->fea_dim(), request_call_num);
// callback: fill each shard's result into the regions, in order
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [request_call_num, num_per_shard, regions, region_num,
accessor](void *done) {
int ret = 0;
size_t region_idx = 0;       // index of the region currently being filled
size_t region_data_idx = 0;  // data offset inside the current region
auto *closure = (DownpourBrpcClosure *)done;
size_t shard_data_size = num_per_shard * accessor->select_size();
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_response(i, PS_PULL_DENSE_TABLE) != 0) {
ret = -1;
break;
}
auto &res_io_buffer = closure->cntl(i)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
size_t shard_buffer_remain = res_io_buffer.size();
if (shard_buffer_remain != shard_data_size) {
LOG(ERROR) << "expect res_size:" << shard_data_size
<< ", but size:" << shard_buffer_remain
<< ", ignore this response";
ret = -1;
break;
}
while (shard_buffer_remain > 0 && region_idx < region_num) {
auto &region = regions[region_idx];
if (region.size - region_data_idx >= shard_buffer_remain) {
// remaining space in the region >= the shard buffer, copy it in directly
io_buffer_itr.copy_and_forward(
(void *)(region.data + region_data_idx), shard_buffer_remain);
region_data_idx += shard_buffer_remain;
shard_buffer_remain = 0;
} else if (region.size - region_data_idx == 0) {
// the region is full, switch to the next region
++region_idx;
region_data_idx = 0;
} else {
// the region cannot hold all the data, copy as much as fits
io_buffer_itr.copy_and_forward(
(void *)(region.data + region_data_idx),
region.size - region_data_idx);
shard_buffer_remain -= (region.size - region_data_idx);
++region_idx;
region_data_idx = 0;
}
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (size_t i = 0; i < request_call_num; ++i) {
closure->request(i)->set_cmd_id(PS_PULL_DENSE_TABLE);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
closure->request(i)->add_params((char *)&num_per_shard,
sizeof(num_per_shard));
PsService_Stub rpc_stub(get_dense_channel(i));
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
return fut;
}
std::future<int32_t> BrpcPsClient::push_dense_param(const Region *regions,
size_t region_num,
size_t table_id) {
auto *accessor = table_accessor(table_id);
size_t request_call_num = _server_channels.size();
// 1. split the region data across shards; the shards are then copied in parallel
std::vector<std::vector<Region>> regions_partition(request_call_num);
uint32_t num_per_shard =
dense_dim_per_shard(accessor->fea_dim(), request_call_num);
size_t shard_data_size = num_per_shard * accessor->update_size();
size_t current_region_idx = 0;
size_t current_region_data_idx = 0;
for (size_t i = 0; i < request_call_num; ++i) {
size_t shard_data_remain_size = shard_data_size;
while (shard_data_remain_size > 0 && current_region_idx < region_num) {
const auto &region = regions[current_region_idx];
size_t region_remain_size = region.size - current_region_data_idx;
if (shard_data_remain_size >= region_remain_size) {
regions_partition[i].push_back(
Region(region.data + current_region_data_idx, region_remain_size));
++current_region_idx;
current_region_data_idx = 0;
shard_data_remain_size -= region_remain_size;
} else {
regions_partition[i].push_back(Region(
region.data + current_region_data_idx, shard_data_remain_size));
current_region_data_idx += shard_data_remain_size;
shard_data_remain_size = 0;
}
}
}
DownpourBrpcClosure *closure =
new DownpourBrpcClosure(request_call_num, [request_call_num](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_response(i, PS_PUSH_DENSE_PARAM) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
static const int REGION_ASSIGN_BUFFER_SIZE = 1024 * 10;
static char region_assign_buffer[REGION_ASSIGN_BUFFER_SIZE];  // used for padding
// copy the data for each shard and send the requests in parallel
for (size_t i = 0; i < request_call_num; ++i) {
closure->request(i)->set_cmd_id(PS_PUSH_DENSE_PARAM);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
auto &request_buffer = closure->cntl(i)->request_attachment();
request_buffer.append((void *)&num_per_shard, sizeof(uint32_t));
auto &region_list = regions_partition[i];
size_t fill_remain_size = shard_data_size;
for (auto &region : region_list) {
fill_remain_size -= region.size;
request_buffer.append((void *)region.data, region.size);
}
// pad so that every shard carries the same amount of data
while (fill_remain_size > 0) {
size_t fill_num = fill_remain_size > REGION_ASSIGN_BUFFER_SIZE
? REGION_ASSIGN_BUFFER_SIZE
: fill_remain_size;
request_buffer.append((void *)region_assign_buffer, fill_num);
fill_remain_size -= fill_num;
}
PsService_Stub rpc_stub(get_dense_channel(i));
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
return fut;
}
std::future<int32_t> BrpcPsClient::push_sparse_raw_gradient(
size_t table_id, const uint64_t *keys, const float **update_values,
size_t num, void *done) {
auto *accessor = table_accessor(table_id);
// send RPC requests
DownpourBrpcClosure *closure = reinterpret_cast<DownpourBrpcClosure *>(done);
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
size_t request_call_num = _server_channels.size();
std::vector<std::vector<uint64_t>> ids;
std::vector<std::vector<const float *>> value_ptrs;
ids.resize(request_call_num);
value_ptrs.resize(request_call_num);
for (size_t i = 0; i < num; ++i) {
size_t pserver_idx = keys[i] % request_call_num;
ids[pserver_idx].push_back(keys[i]);
value_ptrs[pserver_idx].push_back(update_values[i]);
}
for (size_t shard_idx = 0; shard_idx < request_call_num; ++shard_idx) {
auto kvs = ids[shard_idx];
auto value_ptr = value_ptrs[shard_idx];
size_t kv_size = kvs.size();
uint32_t value_size = accessor->update_size();
// send the RPC request
auto *push_request = closure->request(shard_idx);
push_request->set_cmd_id(PS_PUSH_SPARSE_TABLE);
push_request->set_table_id(table_id);
push_request->set_client_id(_client_id);
push_request->add_params((char *)&kv_size, sizeof(uint32_t));
auto *push_data = push_request->mutable_data();
push_data->resize(kv_size * (sizeof(uint64_t) + accessor->update_size()));
char *push_data_ptr = const_cast<char *>(push_data->data());
memcpy(push_data_ptr, kvs.data(), kv_size * sizeof(uint64_t));
push_data_ptr += kv_size * sizeof(uint64_t);
for (int i = 0; i < kv_size; ++i) {
memcpy(push_data_ptr, value_ptr[i], accessor->update_size());
push_data_ptr += accessor->update_size();
}
PsService_Stub rpc_stub(get_sparse_channel(shard_idx));
closure->cntl(shard_idx)->set_request_compress_type(
(brpc::CompressType)FLAGS_pserver_communicate_compress_type);
rpc_stub.service(closure->cntl(shard_idx), closure->request(shard_idx),
closure->response(shard_idx), closure);
}
return fut;
}
std::future<int32_t> BrpcPsClient::push_dense_raw_gradient(
int table_id, float *total_send_data, size_t total_send_data_size,
void *done) {
size_t request_call_num = _server_channels.size();
DownpourBrpcClosure *closure = reinterpret_cast<DownpourBrpcClosure *>(done);
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
auto *accessor = table_accessor(table_id);
uint32_t num_per_shard =
dense_dim_per_shard(accessor->fea_dim(), request_call_num);
for (size_t i = 0; i < request_call_num; ++i) {
closure->request(i)->set_cmd_id(PS_PUSH_DENSE_TABLE);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
auto *push_data = closure->request(i)->mutable_data();
push_data->clear();
push_data->resize(sizeof(uint32_t) + num_per_shard * sizeof(float));
char *push_data_ptr = const_cast<char *>(push_data->data());
memcpy(push_data_ptr, &num_per_shard, sizeof(uint32_t));
memcpy(push_data_ptr + sizeof(uint32_t),
total_send_data + i * num_per_shard, num_per_shard * sizeof(float));
VLOG(1) << "push_dense_raw_gradient finish memcpy";
// closure->cntl(i)->set_request_compress_type(
// (brpc::CompressType)FLAGS_pserver_communicate_compress_type);
PsService_Stub rpc_stub(get_dense_channel(i));
VLOG(1) << "push_dense_raw_gradient get_dense_channel " << i;
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
VLOG(1) << "push_dense_raw_gradient async service " << i;
}
return fut;
}
std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num) {
size_t request_call_num = _server_channels.size();
auto shard_sorted_kvs = std::make_shared<
std::vector<std::vector<std::pair<uint64_t, float *>>>>();
shard_sorted_kvs->resize(request_call_num);
for (size_t i = 0; i < num; ++i) {
size_t shard_id = keys[i] % request_call_num;
shard_sorted_kvs->at(shard_id).push_back({keys[i], select_values[i]});
}
auto *accessor = table_accessor(table_id);
size_t value_size = accessor->select_size();
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [shard_sorted_kvs, value_size](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < shard_sorted_kvs->size(); ++i) {
if (closure->check_response(i, PS_PULL_SPARSE_TABLE) != 0) {
ret = -1;
break;
}
auto &request_kvs = shard_sorted_kvs->at(i);
auto &res_io_buffer = closure->cntl(i)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
uint64_t last_key = UINT64_MAX;
float *last_value_data = NULL;
for (size_t kv_idx = 0; kv_idx < request_kvs.size(); ++kv_idx) {
auto *kv_pair = &(request_kvs[kv_idx]);
if (kv_pair->first == last_key) {
memcpy((void *)kv_pair->second, (void *)last_value_data,
value_size);
} else {
last_key = kv_pair->first;
last_value_data = kv_pair->second;
if (value_size !=
io_buffer_itr.copy_and_forward((void *)(last_value_data),
value_size)) {
LOG(WARNING) << "res data is lack or not in format";
ret = -1;
break;
}
}
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (size_t i = 0; i < request_call_num; ++i) {
auto &sorted_kvs = shard_sorted_kvs->at(i);
std::sort(sorted_kvs.begin(), sorted_kvs.end(),
[](const std::pair<uint64_t, float *> &k1,
const std::pair<uint64_t, float *> &k2) {
return k1.first < k2.first;
});
uint64_t last_key = UINT64_MAX;
uint32_t kv_request_count = 0;
size_t sorted_kv_size = sorted_kvs.size();
auto &request_buffer = closure->cntl(i)->request_attachment();
for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) {
++kv_request_count;
last_key = sorted_kvs[kv_idx].first;
request_buffer.append((void *)&last_key, sizeof(uint64_t));
while (kv_idx < sorted_kv_size - 1 &&
last_key == sorted_kvs[kv_idx + 1].first) {
++kv_idx;
}
}
if (kv_request_count == 0) {
closure->Run();
} else {
closure->request(i)->set_cmd_id(PS_PULL_SPARSE_TABLE);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
closure->request(i)->add_params((char *)&kv_request_count,
sizeof(uint32_t));
PsService_Stub rpc_stub(get_cmd_channel(i));
closure->cntl(i)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
}
return fut;
}
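// Client-to-client messages reuse the PsService RPC surface: the request's
// cmd_id carries the raw msg_type, and the callback validates the reply with
// check_response(0, msg_type + 1000).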
std::future<int32_t> BrpcPsClient::send_client2client_msg(
int msg_type, int to_client_id, const std::string &msg) {
auto promise = std::make_shared<std::promise<int32_t>>();
std::future<int> fut = promise->get_future();
  if (static_cast<size_t>(to_client_id) >= _client_channels.size()) {
    LOG(FATAL) << "to_client_id is out of range of clients, whose size is "
               << _client_channels.size();
promise->set_value(-1);
return fut;
}
auto *closure = new DownpourBrpcClosure(1, [msg_type](void *done) {
auto *closure = (DownpourBrpcClosure *)done;
int32_t ret = closure->check_response(0, msg_type + 1000);
closure->set_promise_value(ret);
});
closure->add_promise(promise);
closure->request(0)->set_cmd_id(msg_type);
closure->request(0)->set_client_id(_client_id);
closure->request(0)->set_data(msg);
PsService_Stub rpc_stub(_client_channels[to_client_id].get());
rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0),
closure);
return fut;
}
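// push_sparse_raw_gradient_partial sends one shard's keys and gradients to a
// single pserver. num travels in params(0); the data field packs the keys
// first and the values after them:
//   |--num * sizeof(uint64_t) key bytes--|--num * update_size value bytes--|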
std::future<int32_t> BrpcPsClient::push_sparse_raw_gradient_partial(
size_t table_id, const uint64_t *keys, const float **update_values,
uint32_t num, void *done, int pserver_idx) {
auto *accessor = table_accessor(table_id);
size_t value_size = accessor->update_size();
DownpourBrpcClosure *closure = reinterpret_cast<DownpourBrpcClosure *>(done);
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
  // Send the RPC request
auto *push_request = closure->request(0);
push_request->set_cmd_id(PS_PUSH_SPARSE_TABLE);
push_request->set_table_id(table_id);
push_request->set_client_id(_client_id);
push_request->add_params((char *)&num, sizeof(uint32_t));
auto *push_data = push_request->mutable_data();
push_data->resize(num * (sizeof(uint64_t) + value_size));
char *push_data_ptr = const_cast<char *>(push_data->data());
memcpy(push_data_ptr, keys, num * sizeof(uint64_t));
push_data_ptr += num * sizeof(uint64_t);
  for (uint32_t i = 0; i < num; ++i) {
memcpy(push_data_ptr, update_values[i], value_size);
push_data_ptr += value_size;
}
PsService_Stub rpc_stub(get_sparse_channel(pserver_idx));
closure->cntl(0)->set_request_compress_type(
(brpc::CompressType)FLAGS_pserver_communicate_compress_type);
rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0),
closure);
return fut;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <array>
#include <atomic>
#include <future>
#include <memory>
#include <string>
#include <vector>
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
#include "paddle/fluid/distributed/service/ps_client.h"
namespace paddle {
namespace distributed {
class DownpourPsClientService : public PsService {
public:
DownpourPsClientService() {}
virtual ~DownpourPsClientService() {}
virtual int32_t configure(PSClient *client, size_t rank_id) {
_client = client;
_rank = rank_id;
return 0;
}
virtual void service(::google::protobuf::RpcController *controller,
const ::paddle::PsRequestMessage *request,
::paddle::PsResponseMessage *response,
::google::protobuf::Closure *done) override;
protected:
size_t _rank;
PSClient *_client;
};
class DownpourBrpcClosure : public PSClientClosure {
public:
DownpourBrpcClosure(size_t num, PSClientCallBack callback)
: PSClientClosure(callback) {
_waiting_num = num;
_cntls.resize(num);
_requests.resize(num);
_responses.resize(num);
for (size_t i = 0; i < num; ++i) {
_cntls[i].reset(new brpc::Controller());
}
}
virtual ~DownpourBrpcClosure() {}
virtual void Run() override {
if (_waiting_num.fetch_sub(1) == 1) {
_callback(this);
delete this;
}
}
PsRequestMessage *request(size_t i) { return &_requests[i]; }
PsResponseMessage *response(size_t i) { return &_responses[i]; }
brpc::Controller *cntl(size_t i) { return _cntls[i].get(); }
int check_response(size_t request_idx, int cmd_id);
int check_save_response(size_t request_idx, int cmd_id);
std::string get_response(size_t request_idx, int cmd_id);
private:
std::atomic<int32_t> _waiting_num;
std::vector<PsRequestMessage> _requests;
std::vector<PsResponseMessage> _responses;
std::vector<std::shared_ptr<brpc::Controller>> _cntls;
};
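// DownpourBrpcClosure fans one logical call out into `num` sub-requests: every
// sub-request invokes Run(), and only the invocation that drops _waiting_num
// to zero fires the user callback and deletes the closure, so instances must
// be heap allocated. A minimal usage sketch, mirroring the closures built in
// the client implementation above:
//   size_t n = ...;  // number of sub-requests
//   auto *closure = new DownpourBrpcClosure(n, [n](void *done) {
//     auto *c = static_cast<DownpourBrpcClosure *>(done);
//     int ret = 0;
//     for (size_t i = 0; i < n; ++i) {
//       if (c->check_response(i, PS_PUSH_DENSE_TABLE) != 0) {
//         ret = -1;
//         break;
//       }
//     }
//     c->set_promise_value(ret);
//   });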
template <class T>
struct array_deleter {
void operator()(T *&x) const { delete[] x; }
};
class BrpcPsClient : public PSClient {
public:
BrpcPsClient() {}
virtual ~BrpcPsClient() {
// _running = false;
// try {
// _async_push_dense_thread.join();
// _async_push_sparse_thread.join();
//} catch (...) {
//}
}
virtual int32_t create_client2client_connection(
int pserver_timeout_ms, int pserver_connect_timeout_ms, int max_retry);
virtual std::future<int32_t> shrink(uint32_t table_id) override;
virtual std::future<int32_t> load(const std::string &epoch,
const std::string &mode) override;
virtual std::future<int32_t> load(uint32_t table_id, const std::string &epoch,
const std::string &mode) override;
virtual std::future<int32_t> save(const std::string &epoch,
const std::string &mode) override;
virtual std::future<int32_t> save(uint32_t table_id, const std::string &epoch,
const std::string &mode) override;
virtual std::future<int32_t> clear() override;
virtual std::future<int32_t> clear(uint32_t table_id) override;
virtual std::future<int32_t> stop_server() override;
virtual std::future<int32_t> start_profiler() override;
virtual std::future<int32_t> stop_profiler() override;
virtual void finalize_worker() override;
virtual std::future<int32_t> pull_dense(Region *regions, size_t region_num,
size_t table_id);
virtual std::future<int32_t> push_dense_param(const Region *regions,
size_t region_num,
size_t table_id);
virtual std::future<int32_t> pull_sparse(float **select_values,
size_t table_id,
const uint64_t *keys, size_t num);
virtual std::future<int32_t> print_table_stat(uint32_t table_id);
virtual std::future<int32_t> barrier(size_t table_id, uint32_t barrier_type);
virtual std::future<int32_t> pull_geo_param(size_t table_id,
std::vector<float> *values,
std::vector<uint64_t> *keys,
int pserver_idx);
virtual std::future<int32_t> flush();
virtual std::future<int32_t> send_client2client_msg(
int msg_type, int to_client_id, const std::string &msg) override;
private:
virtual int32_t initialize() override;
inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total,
uint32_t shard_num) {
return dense_dim_total / shard_num + 1;
}
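  // dense_dim_per_shard adds one to the integer quotient so a shard is never
  // under-provisioned (at the cost of slight over-allocation): e.g.
  // dense_dim_total = 10 with shard_num = 3 yields 4 dims per shard.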
std::future<int32_t> send_cmd(uint32_t table_id, int cmd_id,
const std::vector<std::string> &param);
std::future<int32_t> send_save_cmd(uint32_t table_id, int cmd_id,
const std::vector<std::string> &param);
inline brpc::Channel *get_sparse_channel(size_t server_id) {
return _server_channels[server_id][0].get();
}
inline brpc::Channel *get_dense_channel(size_t server_id) {
return _server_channels[server_id][1].get();
}
inline brpc::Channel *get_cmd_channel(size_t server_id) {
return _server_channels[server_id][2].get();
}
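  // Each pserver exposes three brpc channels, stored per server in
  // _server_channels: [0] sparse traffic, [1] dense traffic, [2] control
  // commands.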
bool _running = false;
bool _flushing = false;
  std::atomic<uint32_t> _async_call_num;  // counter of in-flight async requests
std::vector<std::shared_ptr<brpc::Channel>>
_client_channels; // client2client
std::vector<std::array<std::shared_ptr<brpc::Channel>, 3>>
_server_channels; // client2server
virtual std::future<int32_t> push_dense_raw_gradient(
int table_id, float *total_send_data, size_t total_send_data_size,
void *done) override;
virtual std::future<int32_t> push_sparse_raw_gradient(
size_t table_id, const uint64_t *keys, const float **update_values,
size_t num, void *done) override;
virtual std::future<int32_t> push_sparse_raw_gradient_partial(
size_t table_id, const uint64_t *keys, const float **update_values,
uint32_t num, void *done, int pserver_idx) override;
virtual std::future<int32_t> push_sparse_param(size_t table_id,
const uint64_t *keys,
const float **update_values,
size_t num,
void *done) override;
virtual size_t get_server_nums() { return _server_channels.size(); }
private:
int32_t start_client_service();
float _mae = 0;
float _mse = 0;
uint16_t _push_times = 0;
brpc::Server _server;
DownpourPsClientService _service;
std::atomic_uint grad_num_{0};
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include <thread> // NOLINT
#include "Eigen/Dense"
#include "butil/endpoint.h"
#include "iomanip"
#include "paddle/fluid/distributed/table/table.h"
#include "paddle/fluid/framework/archive.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace distributed {
int32_t BrpcPsServer::initialize() {
auto &service_config = _config.downpour_server_param().service_param();
if (!service_config.has_service_class()) {
LOG(ERROR) << "miss service_class in ServerServiceParameter";
return -1;
}
auto *service = CREATE_CLASS(PsBaseService, service_config.service_class());
if (service == NULL) {
LOG(ERROR) << "service is unregistered, service_name:"
<< service_config.service_class();
return -1;
}
_service.reset(service);
if (service->configure(this) != 0 || service->initialize() != 0) {
LOG(ERROR) << "service initialize failed, service_name:"
<< service_config.service_class();
return -1;
}
if (_server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
LOG(ERROR) << "service add to brpc failed, service:"
<< service_config.service_class();
return -1;
}
return 0;
}
uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) {
std::unique_lock<std::mutex> lock(mutex_);
std::string ip_port = ip + ":" + std::to_string(port);
VLOG(3) << "server of rank " << _rank << " starts at " << ip_port;
int num_threads = std::thread::hardware_concurrency();
brpc::ServerOptions options;
options.num_threads = num_threads;
if (_server.Start(ip_port.c_str(), &options) != 0) {
LOG(ERROR) << "BrpcPsServer start failed, ip_port=" << ip_port;
return 0;
}
VLOG(0) << "BrpcPsServer::start registe_ps_server";
_environment->registe_ps_server(ip, port, _rank);
VLOG(0) << "BrpcPsServer::start wait";
cv_.wait(lock, [&] { return stoped_; });
PSHost host;
host.ip = ip;
host.port = port;
host.rank = _rank;
VLOG(0) << "BrpcPsServer::start return host.rank";
return host.rank;
}
int32_t BrpcPsServer::port() { return _server.listen_address().port; }
int32_t PsService::initialize() {
_is_initialize_shard_info = false;
_service_handler_map[PS_STOP_SERVER] = &PsService::stop_server;
_service_handler_map[PS_PULL_DENSE_TABLE] = &PsService::pull_dense;
_service_handler_map[PS_PUSH_DENSE_TABLE] = &PsService::push_dense;
_service_handler_map[PS_PULL_SPARSE_TABLE] = &PsService::pull_sparse;
_service_handler_map[PS_PUSH_SPARSE_TABLE] = &PsService::push_sparse;
_service_handler_map[PS_SAVE_ONE_TABLE] = &PsService::save_one_table;
_service_handler_map[PS_SAVE_ALL_TABLE] = &PsService::save_all_table;
_service_handler_map[PS_SHRINK_TABLE] = &PsService::shrink_table;
_service_handler_map[PS_LOAD_ONE_TABLE] = &PsService::load_one_table;
_service_handler_map[PS_LOAD_ALL_TABLE] = &PsService::load_all_table;
_service_handler_map[PS_CLEAR_ONE_TABLE] = &PsService::clear_one_table;
_service_handler_map[PS_CLEAR_ALL_TABLE] = &PsService::clear_all_table;
_service_handler_map[PS_PUSH_DENSE_PARAM] = &PsService::push_dense_param;
_service_handler_map[PS_PRINT_TABLE_STAT] = &PsService::print_table_stat;
_service_handler_map[PS_PULL_GEO_PARAM] = &PsService::pull_geo_param;
_service_handler_map[PS_PUSH_SPARSE_PARAM] = &PsService::push_sparse_param;
_service_handler_map[PS_BARRIER] = &PsService::barrier;
_service_handler_map[PS_START_PROFILER] = &PsService::start_profiler;
_service_handler_map[PS_STOP_PROFILER] = &PsService::stop_profiler;
  // Shard initialization: the shard info of server_list can only be obtained
  // from env after the server has started.
initialize_shard_info();
return 0;
}
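// The table above maps every PsCmdID to a PsService member function; service()
// looks the incoming cmd_id up at runtime and dispatches through the member
// pointer, so supporting a new RPC only requires registering one more entry
// plus its handler.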
#define CHECK_TABLE_EXIST(table, request, response) \
if (table == NULL) { \
std::string err_msg("table not found with table_id:"); \
err_msg.append(std::to_string(request.table_id())); \
set_response_code(response, -1, err_msg.c_str()); \
return -1; \
}
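// CHECK_TABLE_EXIST is placed at the top of every table-scoped handler below
// to turn a missing table_id into a -1 response instead of dereferencing a
// null table.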
int32_t PsService::initialize_shard_info() {
if (!_is_initialize_shard_info) {
std::lock_guard<std::mutex> guard(_initialize_shard_mutex);
if (_is_initialize_shard_info) {
return 0;
}
size_t shard_num = _server->environment()->get_ps_servers().size();
auto &table_map = *(_server->table());
    for (auto &itr : table_map) {
itr.second->set_shard(_rank, shard_num);
}
_is_initialize_shard_info = true;
}
return 0;
}
void PsService::service(google::protobuf::RpcController *cntl_base,
const PsRequestMessage *request,
PsResponseMessage *response,
google::protobuf::Closure *done) {
brpc::ClosureGuard done_guard(done);
std::string log_label("ReceiveCmd-");
if (!request->has_table_id()) {
set_response_code(*response, -1, "PsRequestMessage.tabel_id is required");
return;
}
response->set_err_code(0);
response->set_err_msg("");
auto *table = _server->table(request->table_id());
brpc::Controller *cntl = static_cast<brpc::Controller *>(cntl_base);
auto itr = _service_handler_map.find(request->cmd_id());
if (itr == _service_handler_map.end()) {
std::string err_msg(
"undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:");
err_msg.append(std::to_string(request->cmd_id()));
set_response_code(*response, -1, err_msg.c_str());
return;
}
serviceHandlerFunc handler_func = itr->second;
int service_ret = (this->*handler_func)(table, *request, *response, cntl);
if (service_ret != 0) {
response->set_err_code(service_ret);
response->set_err_msg("server internal error");
}
}
int32_t PsService::pull_dense(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::RecordEvent record_event("PsService->pull_dense");
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 1) {
    set_response_code(
        response, -1,
        "PsRequestMessage.params requires at least 1 for num of dense");
return 0;
}
uint32_t num = *(const uint32_t *)request.params(0).c_str();
if (num < 0) {
set_response_code(response, -1,
"PsRequestMessage.datas[0] is invalid, num must >= 0");
return 0;
}
std::vector<float> res_data;
res_data.resize(num * table->value_accesor()->select_size() / sizeof(float));
table->pull_dense(res_data.data(), num);
cntl->response_attachment().append((char *)res_data.data(),
res_data.size() * sizeof(float));
return 0;
}
int32_t PsService::push_dense_param(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::RecordEvent record_event("PsService->push_dense_param");
CHECK_TABLE_EXIST(table, request, response)
thread_local std::string push_buffer;
auto &req_io_buffer = cntl->request_attachment();
auto req_buffer_size = req_io_buffer.size();
if (req_buffer_size < 1) {
set_response_code(response, -1, "req attachment is empty");
return 0;
}
push_buffer.resize(0);
push_buffer.reserve(req_buffer_size);
const char *data = (const char *)cntl->request_attachment().fetch(
const_cast<char *>(push_buffer.data()), req_buffer_size);
uint32_t num = *(const uint32_t *)data;
const float *values = (const float *)(data + sizeof(uint32_t));
if (table->push_dense_param(values, num) != 0) {
set_response_code(response, -1, "push_dense_param failed");
}
return 0;
}
int32_t PsService::push_dense(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::RecordEvent record_event("PsService->push_dense");
CHECK_TABLE_EXIST(table, request, response)
auto req_buffer_size = request.data().size();
if (req_buffer_size < 1) {
// set_response_code(response, 0, "push dense data is empty");
return 0;
}
/*
Push Content:
|--num--|---valuesData---|
|--4B---|----------------|
*/
uint32_t num = *(const uint32_t *)(request.data().data());
const float *values =
(const float *)(request.data().data() + sizeof(uint32_t));
if (table->push_dense(values, num) != 0) {
set_response_code(response, -1, "push_dense failed");
}
return 0;
}
int32_t PsService::barrier(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 1) {
    set_response_code(response, -1,
                      "PsRequestMessage.params requires at "
                      "least 1 element for barrier_type");
return 0;
}
auto trainer_id = request.client_id();
auto barrier_type = request.params(0);
table->barrier(trainer_id, barrier_type);
return 0;
}
int32_t PsService::push_sparse_param(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::RecordEvent record_event("PsService->push_sparse_param");
CHECK_TABLE_EXIST(table, request, response)
auto &push_data = request.data();
if (push_data.size() < 1) {
// set_response_code(response, 0, "push sparse data is empty");
return 0;
}
if (request.params_size() < 1) {
    set_response_code(response, -1,
                      "PsRequestMessage.params requires at "
                      "least 1 element for num of sparse_key");
return 0;
}
uint32_t num = *(uint32_t *)(request.params(0).c_str());
/*
Push Content:
|---keysData---|---valuesData---|
|---8*{num}B---|----------------|
*/
const uint64_t *keys = (const uint64_t *)push_data.data();
const float *values =
(const float *)(push_data.data() + sizeof(uint64_t) * num);
if (table->push_sparse_param(keys, values, num) != 0) {
set_response_code(response, -1, "push_sparse_param error");
}
return 0;
}
int32_t PsService::pull_geo_param(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::RecordEvent record_event("PsService->pull_geo_param");
CHECK_TABLE_EXIST(table, request, response)
thread_local std::string push_sparse_request_buffer;
auto trainer_id = request.client_id();
std::vector<float> values;
std::vector<uint64_t> ids;
table->pull_geo_param(trainer_id, &values, &ids);
uint32_t num = ids.size();
cntl->response_attachment().append((char *)(&num), sizeof(uint32_t));
cntl->response_attachment().append((char *)ids.data(),
ids.size() * sizeof(uint64_t));
cntl->response_attachment().append((char *)values.data(),
values.size() * sizeof(float));
return 0;
}
int32_t PsService::pull_sparse(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::RecordEvent record_event("PsService->pull_sparse");
CHECK_TABLE_EXIST(table, request, response)
thread_local std::string push_sparse_request_buffer;
auto &req_io_buffer = cntl->request_attachment();
auto req_buffer_size = req_io_buffer.size();
if (req_buffer_size < 1) {
set_response_code(response, -1, "req attachment is empty");
return 0;
}
if (request.params_size() < 1) {
    set_response_code(response, -1,
                      "PsRequestMessage.params requires at "
                      "least 1 element for num of sparse_key");
return 0;
}
uint32_t num = *(uint32_t *)(request.params(0).c_str());
push_sparse_request_buffer.resize(0);
push_sparse_request_buffer.reserve(req_buffer_size);
const char *data = (const char *)cntl->request_attachment().fetch(
const_cast<char *>(push_sparse_request_buffer.data()), req_buffer_size);
/*
Attachment Content:
|---keysData---|
|---8*{num}B---|
*/
const uint64_t *keys = (const uint64_t *)data;
std::vector<float> res_data;
res_data.resize(num * table->value_accesor()->select_size() / sizeof(float));
table->pull_sparse(res_data.data(), keys, num);
cntl->response_attachment().append((char *)res_data.data(),
res_data.size() * sizeof(float));
return 0;
}
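// pull_sparse answers with a bare float payload in the response attachment:
// num * select_size bytes, laid out in the same order as the requested keys.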
int32_t PsService::push_sparse(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::RecordEvent record_event("PsService->push_sparse");
CHECK_TABLE_EXIST(table, request, response)
auto &push_data = request.data();
if (push_data.size() < 1) {
// set_response_code(response, 0, "push sparse data is empty");
return 0;
}
if (request.params_size() < 1) {
    set_response_code(response, -1,
                      "PsRequestMessage.params requires at "
                      "least 1 element for num of sparse_key");
return 0;
}
uint32_t num = *(uint32_t *)(request.params(0).c_str());
/*
Push Content:
|---keysData---|---valuesData---|
|---8*{num}B---|----------------|
*/
const uint64_t *keys = (const uint64_t *)push_data.data();
const float *values =
(const float *)(push_data.data() + sizeof(uint64_t) * num);
if (table->push_sparse(keys, values, num) != 0) {
set_response_code(response, -1, "push_sparse error");
}
return 0;
}
int32_t PsService::print_table_stat(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
std::pair<int64_t, int64_t> ret = table->print_table_stat();
paddle::framework::BinaryArchive ar;
ar << ret.first << ret.second;
std::string table_info(ar.Buffer(), ar.Length());
response.set_data(table_info);
return 0;
}
int32_t PsService::load_one_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 2) {
    set_response_code(
        response, -1,
        "PsRequestMessage.params requires at least 2: path & load_param");
return -1;
}
if (table->load(request.params(0), request.params(1)) != 0) {
set_response_code(response, -1, "table load failed");
return -1;
}
return 0;
}
int32_t PsService::load_all_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
auto &table_map = *(_server->table());
for (auto &itr : table_map) {
if (load_one_table(itr.second.get(), request, response, cntl) != 0) {
LOG(ERROR) << "load table[" << itr.first << "] failed";
return -1;
}
}
return 0;
}
int32_t PsService::save_one_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 2) {
    set_response_code(
        response, -1,
        "PsRequestMessage.params requires at least 2: path & mode");
return -1;
}
table->flush();
int32_t feasign_size = 0;
feasign_size = table->save(request.params(0), request.params(1));
if (feasign_size < 0) {
set_response_code(response, -1, "table save failed");
return -1;
}
return feasign_size;
}
int32_t PsService::save_all_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
auto &table_map = *(_server->table());
  int32_t feasign_size = 0;
for (auto &itr : table_map) {
feasign_size = save_one_table(itr.second.get(), request, response, cntl);
if (feasign_size < 0) {
LOG(ERROR) << "save table[" << itr.first << "] failed";
return -1;
}
}
return 0;
}
int32_t PsService::shrink_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
table->flush();
if (table->shrink() != 0) {
set_response_code(response, -1, "table shrink failed");
}
return 0;
}
int32_t PsService::clear_one_table(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
table->flush();
table->clear();
return 0;
}
int32_t PsService::clear_all_table(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
auto &table_map = *(_server->table());
for (auto &itr : table_map) {
if (clear_one_table(itr.second.get(), request, response, cntl) != 0) {
return -1;
}
}
return 0;
}
int32_t PsService::stop_server(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
auto *p_server = _server;
std::thread t_stop([p_server]() {
p_server->stop();
LOG(INFO) << "Server Stoped";
});
t_stop.detach();
return 0;
}
int32_t PsService::stop_profiler(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::DisableProfiler(platform::EventSortingKey::kDefault,
string::Sprintf("server_%s_profile", _rank));
return 0;
}
int32_t PsService::start_profiler(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::EnableProfiler(platform::ProfilerState::kCPU);
return 0;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
#include <memory>
#include <vector>
#include "paddle/fluid/distributed/service/server.h"
namespace paddle {
namespace distributed {
class BrpcPsServer : public PSServer {
public:
BrpcPsServer() {}
virtual ~BrpcPsServer() {}
virtual uint64_t start(const std::string &ip, uint32_t port);
virtual int32_t stop() {
std::unique_lock<std::mutex> lock(mutex_);
stoped_ = true;
cv_.notify_all();
_server.Stop(1000);
_server.Join();
return 0;
}
virtual int32_t port();
private:
virtual int32_t initialize();
mutable std::mutex mutex_;
std::condition_variable cv_;
bool stoped_ = false;
brpc::Server _server;
std::shared_ptr<PsBaseService> _service;
std::vector<std::shared_ptr<brpc::Channel>> _pserver_channels;
};
class PsService;
typedef int32_t (PsService::*serviceHandlerFunc)(
Table *table, const PsRequestMessage &request, PsResponseMessage &response,
brpc::Controller *cntl);
class PsService : public PsBaseService {
public:
virtual int32_t initialize() override;
virtual void service(::google::protobuf::RpcController *controller,
const ::paddle::PsRequestMessage *request,
::paddle::PsResponseMessage *response,
::google::protobuf::Closure *done) override;
private:
int32_t initialize_shard_info();
int32_t pull_dense(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t push_dense(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t push_dense_param(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t push_sparse_param(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl);
int32_t pull_sparse(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t pull_geo_param(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t barrier(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t push_sparse(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t load_one_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t load_all_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t save_one_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t save_all_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t shrink_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t clear_one_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t clear_all_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t stop_server(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t start_profiler(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t stop_profiler(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t print_table_stat(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
bool _is_initialize_shard_info;
std::mutex _initialize_shard_mutex;
std::unordered_map<int32_t, serviceHandlerFunc> _service_handler_map;
std::unordered_map<int32_t, serviceHandlerFunc> _msg_handler_map;
std::vector<float> _ori_values;
};
class DownpourPServerBrpcClosure : public PServerClosure {
public:
DownpourPServerBrpcClosure(size_t num, PServerCallBack callback)
: PServerClosure(callback) {
_waiting_num = num;
_cntls.resize(num);
_requests.resize(num);
_responses.resize(num);
for (size_t i = 0; i < num; ++i) {
_cntls[i].reset(new brpc::Controller());
}
}
virtual ~DownpourPServerBrpcClosure() {}
virtual void Run() override {
if (_waiting_num.fetch_sub(1) == 1) {
_callback(this);
delete this;
}
}
PsRequestMessage *request(size_t i) { return &_requests[i]; }
PsResponseMessage *response(size_t i) { return &_responses[i]; }
brpc::Controller *cntl(size_t i) { return _cntls[i].get(); }
int check_response(size_t request_idx, int cmd_id) { return 1; }
int check_save_response(size_t request_idx, int cmd_id) { return 1; }
private:
std::atomic<int32_t> _waiting_num;
std::vector<PsRequestMessage> _requests;
std::vector<PsResponseMessage> _responses;
std::vector<std::shared_ptr<brpc::Controller>> _cntls;
};
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include <limits>
#include <memory>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
class Scope;
class Variable;
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace distributed {
framework::proto::VarType::Type VarMessageToVarType(
VariableMessage::Type type) {
switch (type) {
case VariableMessage::FP32:
return framework::proto::VarType::FP32; // NOLINT
case VariableMessage::FP64:
return framework::proto::VarType::FP64; // NOLINT
case VariableMessage::INT32:
return framework::proto::VarType::INT32; // NOLINT
case VariableMessage::INT64:
return framework::proto::VarType::INT64; // NOLINT
case VariableMessage::BOOL:
return framework::proto::VarType::BOOL; // NOLINT
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"VarMessageToVarType:Unsupported type %d", type));
}
}
void SerializeToMultiVarMsgAndIOBuf(
const std::string& message_name,
const std::vector<std::string>& send_var_name_val,
const std::vector<std::string>& recv_var_name_val,
const platform::DeviceContext& ctx, const framework::Scope* scope,
MultiVarMsg* request, butil::IOBuf* iobuf) {
// 1. message_name
request->set_message_name(message_name);
// 2. var_names
for (auto& send_var_name : send_var_name_val) {
request->add_send_var_names(send_var_name);
}
for (auto& recv_var_name : recv_var_name_val) {
request->add_recv_var_names(recv_var_name);
}
// 3. VarMessage
for (auto& send_var_name : send_var_name_val) {
auto* send_var_msg = request->add_var_messages();
butil::IOBuf temp_iobuf;
send_var_msg->set_varname(send_var_name);
framework::Variable* var = scope->FindVar(send_var_name);
if (var->IsType<framework::LoDTensor>()) {
SerializeLodTensor(var, ctx, send_var_msg, &temp_iobuf);
} else if (var->IsType<framework::SelectedRows>()) {
SerializeSelectedRows(var, ctx, send_var_msg, &temp_iobuf);
}
iobuf->append(temp_iobuf);
}
}
void SerializeLodTensor(framework::Variable* var,
const platform::DeviceContext& ctx, VarMsg* var_msg,
butil::IOBuf* iobuf) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
var_msg->set_type(::paddle::LOD_TENSOR);
const framework::LoD lod = tensor->lod();
if (lod.size() > 0) {
var_msg->set_lod_level(lod.size());
for (auto& each : lod) {
VarMsg::LodData* lod_inner = var_msg->add_lod();
for (auto& d : each) {
lod_inner->add_lod_data(d);
}
}
}
var_msg->set_data_type(static_cast<VarMsg::Type>(tensor->type()));
for (auto& dim : framework::vectorize(tensor->dims())) {
var_msg->add_dims(dim);
}
// IO Buffer
if (platform::is_cpu_place(tensor->place())) {
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
iobuf->append(reinterpret_cast<const char*>(tensor->data<void>()),
data_len);
} else {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(platform::CPUPlace(), temp_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
iobuf->append(reinterpret_cast<const char*>(temp_ptr), data_len);
delete[] temp_ptr;
#endif
}
}
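// Both serializers append the tensor payload to the IOBuf as an 8-byte length
// prefix followed by the raw bytes; GPU tensors are first staged into a
// temporary host buffer before being appended.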
void SerializeSelectedRows(framework::Variable* var,
const platform::DeviceContext& ctx, VarMsg* var_msg,
butil::IOBuf* iobuf) {
framework::SelectedRows* slr = var->GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
var_msg->set_type(::paddle::SELECTED_ROWS);
var_msg->set_slr_height(slr->height());
auto* var_data = var_msg->mutable_data();
var_data->clear();
var_data->resize(rows->size() * sizeof(int64_t));
char* data_ptr = const_cast<char*>(var_data->data());
if (platform::is_cpu_place(tensor->place())) {
memcpy(data_ptr, &(*rows)[0], rows->size() * sizeof(int64_t));
} else {
#ifdef PADDLE_WITH_CUDA
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
&(*rows)[0], rows->size() * sizeof(int64_t), stream);
#endif
}
var_msg->set_data_type(static_cast<VarMsg::Type>(tensor->type()));
for (auto& dim : framework::vectorize(tensor->dims())) {
var_msg->add_dims(dim);
}
// IO Buffer
if (platform::is_cpu_place(tensor->place())) {
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
iobuf->append(reinterpret_cast<const char*>(tensor->data<void>()),
data_len);
} else {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(platform::CPUPlace(), temp_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
iobuf->append(reinterpret_cast<const char*>(temp_ptr), data_len);
delete[] temp_ptr;
#endif
}
}
void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
const butil::IOBuf* iobuf,
const platform::DeviceContext& ctx,
framework::Scope* scope) {
butil::IOBufBytesIterator io_buffer_itr(*iobuf);
// size_t shard_buffer_remain = res_io_buffer.size();
for (int recv_var_index = 0; recv_var_index < multi_msg.send_var_names_size();
++recv_var_index) {
const auto& msg = multi_msg.var_messages(recv_var_index);
auto* var = scope->Var(msg.varname());
if (msg.type() == ::paddle::LOD_TENSOR) {
DeserializeLodTensor(var, msg, io_buffer_itr, ctx);
} else if (msg.type() == ::paddle::SELECTED_ROWS) {
DeserializeSelectedRows(var, msg, io_buffer_itr, ctx);
}
}
}
void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
const butil::IOBuf* iobuf,
const platform::DeviceContext& ctx,
const framework::Scope* scope) {
butil::IOBufBytesIterator io_buffer_itr(*iobuf);
// size_t shard_buffer_remain = res_io_buffer.size();
for (int recv_var_index = 0; recv_var_index < multi_msg.send_var_names_size();
++recv_var_index) {
const auto& msg = multi_msg.var_messages(recv_var_index);
auto* var = scope->FindVar(msg.varname());
PADDLE_ENFORCE_NE(var, nullptr,
platform::errors::InvalidArgument(
"Not find variable %s in scope.", msg.varname()));
if (msg.type() == ::paddle::LOD_TENSOR) {
DeserializeLodTensor(var, msg, io_buffer_itr, ctx);
} else if (msg.type() == ::paddle::SELECTED_ROWS) {
DeserializeSelectedRows(var, msg, io_buffer_itr, ctx);
}
}
}
void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& io_buffer_itr,
const platform::DeviceContext& ctx) {
const auto place = ctx.GetPlace();
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
std::vector<int> vec_dim;
for (auto& x : msg.dims()) {
vec_dim.push_back(x);
}
tensor->Resize(framework::make_ddim(vec_dim));
framework::LoD lod;
for (int i = 0; i < msg.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < msg.lod(i).lod_data_size(); ++j) {
v.push_back(msg.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
void* tensor_data =
tensor->mutable_data(place, VarMessageToVarType(msg.data_type()));
// IO Buffer
if (platform::is_cpu_place(place)) {
    uint64_t data_len = 0;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
io_buffer_itr.copy_and_forward(tensor_data, data_len);
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
    uint64_t data_len = 0;
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), (void*)temp_ptr,
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
delete[] temp_ptr;
#endif
}
}
void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& io_buffer_itr,
const platform::DeviceContext& ctx) {
const auto place = ctx.GetPlace();
auto* slr = var->GetMutable<framework::SelectedRows>();
framework::Tensor* tensor = slr->mutable_value();
slr->set_height(msg.slr_height());
std::vector<int64_t> tmp_rows(msg.slr_height());
memcpy(&tmp_rows[0], msg.data().data(), msg.slr_height() * sizeof(int64_t));
slr->set_rows(tmp_rows);
std::vector<int> vec_dim;
for (auto& x : msg.dims()) {
vec_dim.push_back(x);
}
tensor->Resize(framework::make_ddim(vec_dim));
void* tensor_data =
tensor->mutable_data(place, VarMessageToVarType(msg.data_type()));
// IO Buffer
if (platform::is_cpu_place(place)) {
    uint64_t data_len = 0;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
io_buffer_itr.copy_and_forward(tensor_data, data_len);
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
    uint64_t data_len = 0;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
io_buffer_itr.copy_and_forward(temp_ptr, data_len);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), temp_ptr,
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
delete[] temp_ptr;
#endif
}
}
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <string>
#include <vector>
#include "brpc/channel.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/port.h"
namespace grpc {
class ByteBuffer;
} // namespace grpc
namespace paddle {
namespace framework {
class Scope;
class Variable;
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace distributed {
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
void SerializeToMultiVarMsgAndIOBuf(
const std::string& message_name,
const std::vector<std::string>& send_var_name_val,
const std::vector<std::string>& recv_var_name_val,
const platform::DeviceContext& ctx, const framework::Scope* scope,
MultiVarMsg* var_msg, butil::IOBuf* iobuf);
void SerializeLodTensor(framework::Variable* var,
const platform::DeviceContext& ctx, VarMsg* var_msg,
butil::IOBuf* iobuf);
void SerializeSelectedRows(framework::Variable* var,
const platform::DeviceContext& ctx, VarMsg* request,
butil::IOBuf* iobuf);
// Deserialize for Server
void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
const butil::IOBuf* iobuf,
const platform::DeviceContext& ctx,
framework::Scope* scope);
// Deserialize for Client
void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
const butil::IOBuf* iobuf,
const platform::DeviceContext& ctx,
const framework::Scope* scope);
void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf,
const platform::DeviceContext& ctx);
void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& iobuf,
const platform::DeviceContext& ctx);
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/service/communicator.h"
#include <google/protobuf/text_format.h>
#include "paddle/fluid/distributed/table/table.h"
#include <gflags/gflags.h>
#include <paddle/fluid/framework/program_desc.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <map>
#include <thread> // NOLINT
#include <unordered_set>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
namespace paddle {
namespace distributed {
using framework::LoDTensor;
using framework::SelectedRows;
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
Communicator::Communicator() {}
void Communicator::init_gflag(const std::string &gflags) {
VLOG(0) << "Init With Gflags:" << gflags;
std::vector<std::string> flags = paddle::string::split_string(gflags);
if (flags.size() < 1) {
flags.push_back("-max_body_size=314217728");
flags.push_back("-bthread_concurrency=40");
flags.push_back("-socket_max_unwritten_bytes=2048000000");
flags.push_back("-max_connection_pool_size=1950");
}
auto it = flags.begin();
flags.insert(it, "exe default");
char *flags_ptr[flags.size()];
for (size_t i = 0; i < flags.size(); ++i) {
flags_ptr[i] = (char *)(flags[i].c_str());
}
int params_cnt = flags.size();
char **params_ptr = &(flags_ptr[0]);
::google::ParseCommandLineFlags(&params_cnt, &params_ptr, true);
}
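// init_gflag expects a space-separated flag string such as
// "-max_body_size=314217728 -bthread_concurrency=40"; a dummy argv[0]
// ("exe default") is prepended before the array is handed to
// ParseCommandLineFlags.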
std::once_flag Communicator::init_flag_;
std::shared_ptr<Communicator> Communicator::communicator_(nullptr);
void Communicator::InitBrpcClient(
const std::string &dist_desc,
const std::vector<std::string> &host_sign_list) {
// not used, just for psclient's init
std::map<uint64_t, std::vector<paddle::distributed::Region>>
_dense_pull_regions;
for (auto &iter : recv_varname_to_ctx_) {
auto tid = iter.first;
auto var_names = iter.second;
auto &regions = _dense_pull_regions[tid];
regions.reserve(var_names.size());
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
float *w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
regions.emplace_back(std::move(reg));
}
}
if (_worker_ptr.get() == nullptr) {
google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param);
init_gflag(_ps_param.init_gflags());
servers_ = host_sign_list.size();
_ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(&host_sign_list, servers_);
_worker_ptr = std::shared_ptr<paddle::distributed::PSClient>(
paddle::distributed::PSClientFactory::create(_ps_param));
_worker_ptr->configure(_ps_param, _dense_pull_regions, _ps_env,
trainer_id_);
}
return;
}
void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,
int table_id, Scope *scope) {
platform::RecordEvent record_event("Communicator->RpcRecvDense");
std::vector<paddle::distributed::Region> regions;
regions.reserve(varnames.size());
for (auto &t : varnames) {
Variable *var = scope->Var(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
Variable *temp_var = xpu_temp_scope_->Var(t);
LoDTensor *temp_tensor = temp_var->GetMutable<LoDTensor>();
temp_tensor->Resize(tensor->dims());
float *temp_data = temp_tensor->mutable_data<float>(platform::CPUPlace());
paddle::distributed::Region reg(temp_data, tensor->numel());
regions.emplace_back(std::move(reg));
VLOG(1) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
<< table_id << " Temp_data[0] " << temp_data[0]
<< " Temp_data[-1] " << temp_data[tensor->numel() - 1];
#endif
} else {
float *w = tensor->mutable_data<float>(tensor->place());
paddle::distributed::Region reg(w, tensor->numel());
regions.emplace_back(std::move(reg));
}
}
auto status =
_worker_ptr->pull_dense(regions.data(), regions.size(), table_id);
status.wait();
for (auto &t : varnames) {
Variable *var = scope->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
LoDTensor *temp_tensor =
xpu_temp_scope_->FindVar(t)->GetMutable<LoDTensor>();
framework::TensorCopy(*temp_tensor, tensor->place(), tensor);
float *temp_data = temp_tensor->mutable_data<float>(platform::CPUPlace());
VLOG(1) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
<< table_id << " Temp_data[0] " << temp_data[0]
<< " Temp_data[-1] " << temp_data[tensor->numel() - 1];
#endif
}
}
return;
}
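// RpcRecvDense pulls directly into the CPU tensors; for GPU-resident variables
// it pulls into a host-side staging scope (xpu_temp_scope_) and copies the
// result back to the device once the RPC completes.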
void Communicator::RpcSendDenseParam(const std::vector<std::string> &varnames,
int table_id, const Scope &scope) {
platform::RecordEvent record_event("Communicator->RpcSendDenseParam");
auto place = platform::CPUPlace();
std::vector<paddle::distributed::Region> regions;
for (auto &t : varnames) {
Variable *var = scope.FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor *tensor = var->GetMutable<LoDTensor>();
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
Variable *temp_var = xpu_temp_scope_->Var(t);
LoDTensor *temp_tensor = temp_var->GetMutable<LoDTensor>();
temp_tensor->Resize(tensor->dims());
float *temp_data = temp_tensor->mutable_data<float>(platform::CPUPlace());
framework::TensorCopy(*tensor, platform::CPUPlace(), temp_tensor);
paddle::distributed::Region reg(temp_data, tensor->numel());
regions.emplace_back(std::move(reg));
VLOG(1) << "AsyncCommunicator::RpcSendDenseParam Var " << t
<< " table_id " << table_id << " Temp_data[0] " << temp_data[0]
<< " Temp_data[-1] " << temp_data[tensor->numel() - 1];
#endif
} else {
float *w = tensor->mutable_data<float>(place);
paddle::distributed::Region reg(w, tensor->numel());
regions.emplace_back(std::move(reg));
VLOG(1) << "AsyncCommunicator::RpcSendDenseParam Var " << t
<< " talbe_id " << table_id << " Temp_data[0] " << w[0]
<< " Temp_data[-1] " << w[tensor->numel() - 1];
}
}
auto status =
_worker_ptr->push_dense_param(regions.data(), regions.size(), table_id);
status.wait();
VLOG(4) << "RPC Send Dense Param " << table_id << " done!";
return;
}
void Communicator::RpcSendDense(const CommContext &ctx, const Scope &scope) {
platform::RecordEvent record_event("Communicator->RpcSendDense");
auto &var_names = ctx.origin_varnames;
auto &table_id = ctx.table_id;
auto dense_data = std::make_shared<std::vector<float>>();
size_t request_call_num = _worker_ptr->get_server_nums();
uint32_t num_per_shard =
dense_dim_per_shard(ctx.height_sections[0], request_call_num);
dense_data->resize(num_per_shard *
request_call_num); // accessor->update_dim() = 1
float *data = dense_data->data();
uint32_t pos = 0;
for (size_t i = 0; i < var_names.size(); ++i) {
const LoDTensor tensor = scope.FindVar(var_names[i])->Get<LoDTensor>();
size_t count = static_cast<size_t>(tensor.numel());
const float *g = tensor.data<float>();
CHECK(pos + count <= dense_data->size())
<< "invalid dense size, cur pos[" << pos << "]"
<< " data_num[" << count << "] size[" << dense_data->size() << "]";
memcpy(data + pos, g, count * sizeof(float));
pos += count;
}
++_async_call_num;
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [this, request_call_num](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_response(i, PS_PUSH_DENSE_TABLE) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
--_async_call_num;
});
auto status = _worker_ptr->push_dense_raw_gradient(
table_id, data, dense_data->size(), closure);
status.wait();
return;
}
void Communicator::RpcSendSparseParam(const std::string &varname, int table_id,
const Scope &scope) {
platform::RecordEvent record_event("Communicator->RpcSendSparseParam");
size_t request_call_num = _worker_ptr->get_server_nums();
std::vector<float *> push_g_vec;
auto *send_var = scope.FindVar(varname);
auto *tensor = send_var->GetMutable<framework::LoDTensor>();
auto dim = tensor->dims()[1];
uint64_t sparse_num = static_cast<uint64_t>(tensor->dims()[0]);
std::vector<uint64_t> sparse_push_keys(sparse_num);
std::iota(sparse_push_keys.begin(), sparse_push_keys.end(), 0);
push_g_vec.reserve(sparse_num);
for (auto i = 0; i < static_cast<int>(sparse_push_keys.size()); ++i) {
push_g_vec.push_back(tensor->data<float>() + i * dim);
}
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [this, request_call_num](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_response(i, PS_PUSH_SPARSE_PARAM) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
});
auto status = _worker_ptr->push_sparse_param(
table_id, sparse_push_keys.data(), (const float **)push_g_vec.data(),
sparse_push_keys.size(), closure);
status.wait();
return;
}
void Communicator::RpcSendSparse(const std::string &var_name, int table_id,
const Scope &scope) {
platform::RecordEvent record_event("Communicator->RpcSendSparse");
size_t request_call_num = _worker_ptr->get_server_nums();
std::vector<uint64_t> sparse_push_keys;
std::vector<float *> push_g_vec;
auto *send_var = scope.FindVar(var_name);
auto *tensor = send_var->GetMutable<SelectedRows>();
auto dim = tensor->value().dims()[1];
std::transform(tensor->rows().begin(), tensor->rows().end(),
std::back_inserter(sparse_push_keys),
[&](int id) { return static_cast<uint64_t>(id); });
for (auto i = 0; i < static_cast<int>(sparse_push_keys.size()); ++i) {
push_g_vec.push_back(tensor->mutable_value()->data<float>() + i * dim);
}
++_async_call_num;
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [this, request_call_num](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_response(i, PS_PUSH_SPARSE_TABLE) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
--_async_call_num;
});
auto status = _worker_ptr->push_sparse_raw_gradient(
table_id, sparse_push_keys.data(), (const float **)push_g_vec.data(),
sparse_push_keys.size(), closure);
status.wait();
return;
}
void Communicator::RpcRecvSparse(const std::string &varname, int table_id,
Scope *scope) {
platform::RecordEvent record_event("Communicator->RpcRecvSparse");
auto *send_var = scope->Var(varname);
auto *tensor = send_var->GetMutable<framework::LoDTensor>();
auto dim = tensor->dims()[1];
uint64_t sparse_num = static_cast<uint64_t>(tensor->dims()[0]);
std::vector<uint64_t> sparse_push_keys(sparse_num);
std::iota(sparse_push_keys.begin(), sparse_push_keys.end(), 0);
std::vector<float *> push_g_vec;
for (auto i = 0; i < static_cast<int>(sparse_push_keys.size()); ++i) {
push_g_vec.push_back(tensor->data<float>() + i * dim);
}
auto status = _worker_ptr->pull_sparse((float **)push_g_vec.data(), table_id,
sparse_push_keys.data(),
sparse_push_keys.size());
status.wait();
return;
}
void Communicator::InitParams(const RecvCtxMap &recv_varname_to_ctx) {
if (trainer_id_ == 0) {
for (auto &iter : recv_varname_to_ctx) {
auto &table_id = iter.first;
auto &varnames = iter.second;
RpcSendDenseParam(varnames, table_id, *recv_scope_);
VLOG(1) << "push dense param to table " << table_id
<< " from 0' trainer done";
}
BarrierWithTable(1);
} else {
BarrierWithTable(1);
for (auto &iter : recv_varname_to_ctx) {
auto &table_id = iter.first;
auto &varnames = iter.second;
RpcRecvDense(varnames, table_id, recv_scope_);
VLOG(1) << "pull dense param to table " << table_id
<< " from 0' trainer done";
}
}
BarrierWithTable(1);
return;
}
void Communicator::RpcProfilerControl() {
if (trainer_id_ == 0) {
if (!do_server_profiler_ && platform::IsProfileEnabled()) {
// send profiler start flag
do_server_profiler_ = true;
auto start_status = _worker_ptr->start_profiler();
start_status.wait();
} else if (do_server_profiler_ && !platform::IsProfileEnabled()) {
// send profiler end flag
auto stop_status = _worker_ptr->stop_profiler();
stop_status.wait();
do_server_profiler_ = false;
}
}
}
void AsyncCommunicator::RecvThread() {
if (!independent_recv_) return;
VLOG(3) << "Independent RecvThread Start and Wait";
while (running_) {
int grad_num = grad_num_.load();
if (grad_num > min_send_grad_num_before_recv_) {
RecvByCommunicator();
grad_num_.store(0);
} else {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
VLOG(1) << "communicator stopped, independent recv thread exit";
}
void AsyncCommunicator::RecvByCommunicator() {
if (!running_) return;
RecvNoBarrier();
VLOG(3) << "run recv graph end";
}
void AsyncCommunicator::RecvNoBarrier() {
for (auto &iter : recv_varname_to_ctx_) {
auto &table_id = iter.first;
auto &varnames = iter.second;
RpcRecvDense(varnames, table_id, recv_scope_);
}
for (auto &iter : recv_varname_to_ctx_) {
auto var_names = iter.second;
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
LoDTensor *temp_tensor =
xpu_temp_scope_->FindVar(t)->GetMutable<LoDTensor>();
framework::TensorCopy(*temp_tensor, tensor->place(), tensor);
#endif
}
}
}
return;
}
void AsyncCommunicator::SendByCommunicator() {
std::vector<std::future<void>> tasks;
tasks.reserve(send_varname_to_ctx_.size());
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
auto send_recv_task = [this, &ctx] {
auto &varnames = ctx.origin_varnames;
auto &table_id = ctx.table_id;
size_t var_nums = varnames.size();
auto &check_queue = send_varname_to_queue_[varnames[0]];
std::vector<std::vector<std::shared_ptr<Variable>>> vars;
vars.resize(var_nums);
int merged_var_num = 0;
int wait_times = 0;
while (merged_var_num < max_merge_var_num_) {
if (check_queue->Size() == 0) {
VLOG(4) << "wait_times -> " << wait_times;
if (wait_times >= send_wait_times_) {
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(10));
wait_times++;
continue;
} else {
wait_times = 0;
for (size_t i = 0; i < var_nums; i++) {
auto &var_name = varnames[i];
auto &var_queue = send_varname_to_queue_[var_name];
vars[i].push_back(var_queue->Pop());
}
merged_var_num++;
}
}
if (merged_var_num == 0) return;
for (size_t i = 0; i < var_nums; i++) {
auto &var_name = varnames[i];
MergeVars<float>(var_name, vars[i], send_scope_.get(), 1);
}
if (ctx.is_sparse) {
PADDLE_ENFORCE_EQ(
varnames.size(), 1,
platform::errors::InvalidArgument(
"sparse variables can only be merged by one variables"));
RpcSendSparse(varnames[0], table_id, *send_scope_);
} else {
RpcSendDense(ctx, *send_scope_);
if (!independent_recv_ &&
recv_varname_to_ctx_.find(table_id) != recv_varname_to_ctx_.end()) {
auto recv_varnames = recv_varname_to_ctx_.at(table_id);
RpcRecvDense(recv_varnames, table_id, recv_scope_);
}
}
if (independent_recv_) {
grad_num_.fetch_add(1, std::memory_order_relaxed);
}
};
tasks.emplace_back(send_threadpool_->enqueue(std::move(send_recv_task)));
}
for (auto &task : tasks) {
task.wait();
}
return;
}
void AsyncCommunicator::MainThread() {
VLOG(3) << "AsyncCommunicator MainThread start and wait";
while (waiting_ && running_) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
VLOG(3) << "wait for running";
}
while (running_) {
SendByCommunicator();
RpcProfilerControl();
}
VLOG(1) << "communicator stopped, send thread exit";
}
void HalfAsyncCommunicator::MainThread() {
VLOG(3) << "HalfAsyncCommunicator MainThread start and wait";
while (waiting_ && running_) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
VLOG(3) << "wait for running";
}
while (running_) {
SendByCommunicator();
BarrierSend();
RecvByCommunicator();
BarrierRecv();
BarrierWeakUp();
}
VLOG(1) << "communicator stopped, send thread exit";
}
void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
const RecvCtxMap &recv_varname_to_ctx,
Scope *recv_scope) {
send_varname_to_ctx_ = std::move(send_varname_to_ctx);
recv_varname_to_ctx_ = std::move(recv_varname_to_ctx);
recv_scope_ = std::move(recv_scope);
send_scope_.reset(new Scope());
xpu_temp_scope_.reset(new Scope());
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
auto &varnames = ctx.origin_varnames;
for (auto &var_name : varnames) {
send_varname_to_queue_[var_name] =
std::make_shared<BlockingQueue<std::shared_ptr<Variable>>>(
send_queue_size_);
}
}
send_threadpool_.reset(new ::ThreadPool(thread_pool_size_));
}
AsyncCommunicator::~AsyncCommunicator() {
running_ = false;
if (main_thread_) main_thread_->join();
if (recv_thread_) recv_thread_->join();
}
void AsyncCommunicator::Start() {
VLOG(1) << "Communicator start";
if (!communicator_) {
VLOG(0) << "Communicator is not inited, do nothing";
} else {
VLOG(1) << "start send thread and recv thread";
waiting_ = true;
running_ = true;
// flushing_ = false;
BarrierTriggerReset(max_merge_var_num_);
// start send and recv thread
main_thread_.reset(
new std::thread(std::bind(&AsyncCommunicator::MainThread, this)));
if (independent_recv_) {
recv_thread_.reset(
new std::thread(std::bind(&AsyncCommunicator::RecvThread, this)));
}
}
}
void AsyncCommunicator::Stop() {
VLOG(1) << "Communicator stop";
running_ = false;
if (!communicator_) {
VLOG(0) << "Communicator is not inited, do nothing";
} else {
if (recv_thread_) {
VLOG(1) << "stop recv thread";
recv_thread_->join();
recv_thread_.reset(nullptr);
}
if (main_thread_) {
VLOG(1) << "stop main thread";
main_thread_->join();
main_thread_.reset(nullptr);
}
}
VLOG(1) << "Communicator stop done";
}
bool AsyncCommunicator::Check(const std::vector<std::string> &var_tables) {
PADDLE_ENFORCE_EQ(
var_tables.size(), 1,
platform::errors::InvalidArgument("var_tables.size() == 1 is permitted"));
auto table_name = var_tables[0];
if (send_varname_to_ctx_.find(table_name) == send_varname_to_ctx_.end())
return false;
return true;
}
bool AsyncCommunicator::Check(const int table_id) {
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
if (ctx.table_id == table_id) return true;
}
return false;
}
void AsyncCommunicator::Send(const std::vector<std::string> &var_names,
const framework::Scope &scope) {
waiting_ = false;
for (size_t i = 0; i < var_names.size(); i++) {
auto *var = scope.FindVar(var_names[i]);
auto tmp_grad_var = std::make_shared<Variable>();
framework::CopyVariable(*var, tmp_grad_var.get());
send_varname_to_queue_[var_names[i]]->Push(tmp_grad_var);
}
}
void HalfAsyncCommunicator::Clean() {
for (auto &iter : send_varname_to_queue_) {
auto &var_name = iter.first;
auto &var_queue = iter.second;
while (var_queue->Size() > 0) {
var_queue->Pop();
}
VLOG(3) << "clean var: " << var_name << " done";
}
}
void HalfAsyncCommunicator::BarrierTriggerDecrement() {
barrier_trigger_--;
VLOG(3) << "BarrierTriggerDecrement decrement barrier trigger to "
<< barrier_trigger_.load();
}
void HalfAsyncCommunicator::BarrierTriggerReset(int initial_val) {
barrier_trigger_.store(initial_val);
VLOG(3) << "BarrierTriggerReset reset barrier trigger to "
<< barrier_trigger_.load();
}
void HalfAsyncCommunicator::Barrier() {
barrier_counter_++;
if (!running_) {
VLOG(3) << "Communicator is not running, release barrier";
return;
}
{
std::unique_lock<std::mutex> lk(barrier_mutex_);
barrier_cond_.wait(lk, [this] { return (barrier_counter_ == 0); });
}
}
int HalfAsyncCommunicator::BatchesCounter() {
while (running_) {
if (barrier_counter_.load() >= barrier_trigger_.load() &&
barrier_trigger_.load() != 0) {
break;
} else {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
return barrier_counter_.load();
}
void HalfAsyncCommunicator::SendByCommunicator() {
int batches = BatchesCounter();
VLOG(1) << "HalfAsyncCommunicator::BatchesCounter = " << batches;
if (batches <= 0) return;
std::vector<std::future<void>> tasks;
tasks.reserve(send_varname_to_ctx_.size());
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
auto send_recv_task = [this, &ctx, batches] {
auto &varnames = ctx.origin_varnames;
auto &table_id = ctx.table_id;
size_t var_nums = varnames.size();
std::vector<std::vector<std::shared_ptr<Variable>>> vars;
vars.resize(var_nums);
for (size_t i = 0; i < var_nums; i++) {
auto &var_name = varnames[i];
auto &var_queue = send_varname_to_queue_[var_name];
for (int j = 0; j < batches; j++) vars[i].push_back(var_queue->Pop());
MergeVars<float>(var_name, vars[i], send_scope_.get(), 1);
}
if (ctx.is_sparse) {
PADDLE_ENFORCE_EQ(
varnames.size(), 1,
platform::errors::InvalidArgument(
"sparse variables can only be merged by one variables"));
RpcSendSparse(varnames[0], table_id, *send_scope_);
} else {
RpcSendDense(ctx, *send_scope_);
}
};
tasks.emplace_back(send_threadpool_->enqueue(std::move(send_recv_task)));
}
for (auto &task : tasks) {
task.wait();
}
return;
}
void HalfAsyncCommunicator::BarrierWeakUp() {
barrier_counter_.store(0);
barrier_cond_.notify_all();
}
void SyncCommunicator::BarrierSend() {
if (!running_) return;
BarrierWithTable(0);
VLOG(4) << "BarrierSend with SyncCommunicator";
}
void SyncCommunicator::BarrierRecv() {
if (!running_) return;
BarrierWithTable(1);
VLOG(4) << "BarrierRecv with SyncCommunicator";
}
void GeoCommunicator::Send(const std::vector<std::string> &var_names,
const framework::Scope &scope) {
waiting_ = false;
auto before_send = GetCurrentUS();
auto table_name = var_names[0];
size_t splited_var_nums =
send_varname_to_ctx_[table_name].splited_varnames.size();
std::unordered_map<std::string, std::unordered_set<int64_t>> ids_table;
for (size_t j = 0; j < splited_var_nums; j++) {
ids_table.insert(std::pair<std::string, std::unordered_set<int64_t>>(
send_varname_to_ctx_[table_name].splited_varnames[j],
std::unordered_set<int64_t>()));
}
auto *var = scope.FindVar(table_name);
PADDLE_ENFORCE_EQ(var->IsType<framework::SelectedRows>(), true,
platform::errors::InvalidArgument(
"Only need to send Sparse Grad in Geo mode."));
auto &rows = var->Get<framework::SelectedRows>().rows();
// insert ids which have not been recorded yet
for (size_t j = 0; j < rows.size(); j++) {
auto ep_idx = rows[j] % splited_var_nums;
ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx])
.insert(rows[j]);
}
for (auto &iter : ids_table) {
auto &key = iter.first;
auto &sparse_ids_set = iter.second;
auto sparse_ids_vec = std::make_shared<std::vector<int64_t>>();
sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end());
sparse_id_queues_.at(key)->Push(sparse_ids_vec);
VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key
<< "'s queue";
}
auto after_send = GetCurrentUS();
VLOG(2) << "run send op finish. use time " << (after_send - before_send);
}
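// NOTE: a minimal sketch of the sharding above (illustrative only, guarded
// out of the build); ids go to splited_varnames[id % splited_var_nums]. The
// variable names and values below are hypothetical.
#if 0
inline void GeoShardingExample() {
  std::vector<std::string> splited_varnames = {"emb.block0", "emb.block1"};
  std::vector<int64_t> rows = {0, 3, 5, 8};
  std::unordered_map<std::string, std::unordered_set<int64_t>> ids_table;
  for (auto &name : splited_varnames) ids_table[name] = {};
  for (auto id : rows) {
    auto ep_idx = id % splited_varnames.size();  // 0->0, 3->1, 5->1, 8->0
    ids_table.at(splited_varnames[ep_idx]).insert(id);
  }
  // ids_table["emb.block0"] == {0, 8}, ids_table["emb.block1"] == {3, 5}
}
#endif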
void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
const RecvCtxMap &recv_varname_to_ctx,
Scope *recv_scope) {
send_varname_to_ctx_ = std::move(send_varname_to_ctx);
recv_varname_to_ctx_ = std::move(recv_varname_to_ctx);
recv_scope_ = std::move(recv_scope);
PADDLE_ENFORCE_GT(
send_varname_to_ctx.size(), 0,
platform::errors::InvalidArgument("send var contexts can not be zero"));
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
if (!ctx.is_sparse) continue;
auto &varnames = ctx.origin_varnames;
PADDLE_ENFORCE_EQ(
varnames.size(), 1,
platform::errors::InvalidArgument(
"sparse variables can only be merged by one variables"));
for (auto &splited_var : ctx.splited_varnames) {
parallel_task_nums_ += 1;
sparse_id_queues_.insert(
std::pair<std::string, std::shared_ptr<BlockingQueue<
std::shared_ptr<std::vector<int64_t>>>>>(
splited_var,
std::make_shared<
BlockingQueue<std::shared_ptr<std::vector<int64_t>>>>(
send_queue_size_)));
}
}
send_threadpool_.reset(new ::ThreadPool(thread_pool_size_));
delta_scope_.reset(new Scope());
old_scope_.reset(new Scope());
pserver_scope_.reset(new Scope());
}
void GeoCommunicator::InitParams(const RecvCtxMap &recv_varname_to_ctx) {
std::vector<std::future<void>> tasks;
tasks.reserve(recv_varname_to_ctx_.size());
for (auto &iter : recv_varname_to_ctx_) {
auto &table_id = iter.first;
auto &varnames = iter.second;
auto recv_task = [this, &table_id, &varnames] {
InitDense(varnames, table_id);
};
tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task)));
}
for (auto &task : tasks) {
task.wait();
}
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
if (!ctx.is_sparse) continue;
auto &varname = ctx.origin_varnames[0];
auto &table_id = ctx.table_id;
auto param = varname.substr(0, varname.size() - 5);
InitSparse(param, table_id);
}
return;
}
void GeoCommunicator::InitDense(std::vector<std::string> &varnames,
int table_id) {
if (trainer_id_ == 0) {
RpcSendDenseParam(varnames, table_id, *recv_scope_);
BarrierWithTable(1);
VLOG(0) << "push dense param to table " << table_id
<< " from 0' trainer done";
} else {
BarrierWithTable(1);
RpcRecvDense(varnames, table_id, recv_scope_);
VLOG(0) << "push dense param to table " << table_id
<< " from 0' trainer done";
}
// copy to old_scope
for (auto &t : varnames) {
auto *global_var = recv_scope_->FindVar(t);
global_var->GetMutable<framework::LoDTensor>();
auto *old_var = old_scope_->Var(t);
old_var->GetMutable<framework::LoDTensor>();
framework::CopyVariable(*global_var, old_var);
}
VLOG(1) << "init dense table " << table_id << " done";
}
void GeoCommunicator::SendDense(const CommContext &send_ctx) {
platform::RecordEvent record_event("GeoCommunicator->SendDense");
auto &var_names = send_ctx.origin_varnames;
auto &table_id = send_ctx.table_id;
for (auto &varname : var_names) {
auto param_name = GradToParam(varname);
auto *var_latest = recv_scope_->FindVar(param_name);
auto *var_timestamp = old_scope_->FindVar(param_name);
PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true,
platform::errors::Unavailable(
"%s is not initialized, please check", param_name));
PADDLE_ENFORCE_EQ(var_timestamp->IsInitialized(), true,
platform::errors::Unavailable(
"%s is not initialized, please check", param_name));
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
auto cpu_ctx = paddle::platform::CPUDeviceContext();
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
blas.VSUB(t_latest.numel(), t_latest.data<float>(),
t_timestamp->data<float>(), t_delta->data<float>());
float coefficient = 1.0 / static_cast<float>(trainers_);
blas.SCAL(t_latest.numel(), coefficient, t_delta->data<float>());
blas.VADD(t_latest.numel(), t_timestamp->data<float>(),
t_delta->data<float>(), t_timestamp->data<float>());
}
RpcSendDense(send_ctx, *delta_scope_);
VLOG(1) << "Finish Send Dense " << var_names[0] << ", table_id: " << table_id;
return;
}
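// NOTE (worked example for SendDense above): with trainers_ = 2, a slot whose
// latest value is 1.0 and cached old value is 0.6 produces
// delta = (1.0 - 0.6) / 2 = 0.2; the delta scope is pushed to the pserver and
// old is advanced to 0.6 + 0.2 = 0.8, so the next send only carries newer
// local changes.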
void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
platform::RecordEvent record_event("GeoCommunicator->RecvDense");
auto &table_id = send_ctx.table_id;
auto &varnames = recv_varname_to_ctx_.at(table_id);
// 1. recv from pserver
RpcRecvDense(varnames, table_id, pserver_scope_.get());
// 2.1 pserver - old => delta; 2.2 latest + delta => latest; 2.3 pserver => old
auto cpu_ctx = paddle::platform::CPUDeviceContext();
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
auto *var_old = old_scope_->FindVar(varname);
auto t_old = var_old->GetMutable<framework::LoDTensor>();
auto *var_pserver = pserver_scope_->FindVar(varname);
auto t_pserver = var_pserver->Get<framework::LoDTensor>();
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest->dims(), cpu_ctx.GetPlace());
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
blas.VSUB(t_latest->numel(), t_pserver.data<float>(), t_old->data<float>(),
t_delta->data<float>());
blas.VADD(t_latest->numel(), t_latest->data<float>(),
t_delta->data<float>(), t_latest->data<float>());
blas.VCOPY(t_latest->numel(), t_pserver.data<float>(),
t_old->data<float>());
}
VLOG(1) << "Finish Recv Dense " << varnames[0] << ", table_id: " << table_id;
return;
}
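// NOTE (worked example for RecvDense above): if the pserver holds 1.4, the
// cached old value is 0.8 and the local latest value is 1.1, then
// delta = 1.4 - 0.8 = 0.6, latest becomes 1.1 + 0.6 = 1.7, and old is reset
// to the pserver value 1.4 (pserver - old => delta, latest + delta => latest,
// pserver => old).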
void GeoCommunicator::InitSparse(const std::string &var_name, int table_id) {
VLOG(0) << "Init Sparse " << var_name << " : table " << table_id << " begin.";
if (trainer_id_ == 0) {
RpcSendSparseParam(var_name, table_id, *recv_scope_);
BarrierWithTable(1);
VLOG(0) << "push sparse param to table " << table_id
<< " from 0' trainer done";
} else {
BarrierWithTable(1);
RpcRecvSparse(var_name, table_id, recv_scope_);
VLOG(0) << "push dense param to table " << table_id
<< " from 0' trainer done";
}
VLOG(0) << "Init Sparse " << var_name << " : table " << table_id << " done.";
auto *global_var = recv_scope_->FindVar(var_name);
auto *var = old_scope_->Var(var_name);
framework::CopyVariable(*global_var, var);
return;
}
std::vector<int64_t> GeoCommunicator::MergeSparseIds(
const std::string &send_varname) {
size_t merge_num = 0, wait_times = 0;
std::unordered_set<int64_t> sparse_ids;
while (merge_num < static_cast<size_t>(max_merge_var_num_)) {
VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num;
if (sparse_id_queues_.at(send_varname)->Size() > 0) {
wait_times = 0;
std::shared_ptr<std::vector<int64_t>> pop_ids =
sparse_id_queues_.at(send_varname)->Pop();
for (size_t j = 0; j < pop_ids->size(); j++) {
sparse_ids.insert(pop_ids->at(j));
}
merge_num += 1;
VLOG(3) << "sparse_id_queues_(" << send_varname << ") pushed";
} else if (sparse_id_queues_.at(send_varname)->Size() == 0) {
VLOG(3) << "wait_times -> " << wait_times;
if (wait_times >= static_cast<size_t>(send_wait_times_)) {
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(10));
wait_times++;
continue;
}
}
std::vector<int64_t> res;
res.assign(sparse_ids.begin(), sparse_ids.end());
return res;
}
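// NOTE: a small sketch of the merge above (illustrative only, guarded out of
// the build): ids popped from up to max_merge_var_num_ queued batches are
// de-duplicated so a single SendSparse rpc covers them all. The batch
// contents below are hypothetical.
#if 0
inline void MergeSparseIdsExample() {
  std::vector<std::vector<int64_t>> queued_batches = {{1, 2, 3}, {3, 4}, {2, 5}};
  std::unordered_set<int64_t> sparse_ids;
  for (auto &batch : queued_batches) {
    sparse_ids.insert(batch.begin(), batch.end());
  }
  std::vector<int64_t> res(sparse_ids.begin(), sparse_ids.end());
  // res holds {1, 2, 3, 4, 5} in unspecified order; one rpc instead of three.
}
#endif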
void GeoCommunicator::SendSparse(const std::string &varname,
std::vector<int64_t> &sparse_ids, int table_id,
int ep_idx) {
platform::RecordEvent record_event("GeoCommunicator->SendSparse");
std::string param_name = SplitedGradToParam(varname);
VLOG(1) << "In GeoCommunicator::SendSparse(" << varname << " " << param_name
<< ", ids.size = " << sparse_ids.size() << ", table_id: " << table_id
<< ", ep_idx: " << ep_idx;
auto *var_latest = recv_scope_->FindVar(param_name);
auto *var_old = old_scope_->FindVar(param_name);
PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true,
platform::errors::Unavailable(
"%s is not initialized, please check", param_name));
PADDLE_ENFORCE_EQ(var_old->IsInitialized(), true,
platform::errors::Unavailable(
"%s is not initialized, please check", param_name));
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest.dims()[1];
auto cpu_ctx = paddle::platform::CPUDeviceContext();
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::SelectedRows>();
auto *var_t_value = t_delta->mutable_value();
var_t_value->Resize({static_cast<int64_t>(sparse_ids.size()), dims1});
auto *t_value = var_t_value->mutable_data<float>(cpu_ctx.GetPlace());
t_delta->set_rows(sparse_ids);
t_delta->set_height(t_latest.dims()[0]);
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
float coefficient = 1.0 / static_cast<float>(trainers_);
std::vector<float *> push_g_vec;
for (auto j = 0; j < static_cast<int>(sparse_ids.size()); ++j) {
blas.VSUB(dims1, t_latest.data<float>() + sparse_ids[j] * dims1,
t_old->data<float>() + sparse_ids[j] * dims1,
t_value + j * dims1);
blas.SCAL(dims1, coefficient, t_value + j * dims1);
blas.VADD(dims1, t_old->data<float>() + sparse_ids[j] * dims1,
t_value + j * dims1,
t_old->data<float>() + sparse_ids[j] * dims1);
push_g_vec.push_back(t_value + j * dims1);
}
++_async_call_num;
DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [this](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
if (closure->check_response(0, PS_PUSH_SPARSE_TABLE) != 0) {
ret = -1;
}
closure->set_promise_value(ret);
--_async_call_num;
});
auto status = _worker_ptr->push_sparse_raw_gradient_partial(
table_id, (const uint64_t *)sparse_ids.data(),
(const float **)push_g_vec.data(), sparse_ids.size(), closure, ep_idx);
status.wait();
VLOG(1) << "Finish Send Sparse " << varname
<< ", ids.size = " << sparse_ids.size() << ", table_id: " << table_id;
return;
}
void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
int ep_idx) {
platform::RecordEvent record_event("GeoCommunicator->RecvSparse");
// 1. recv from pserver
std::vector<uint64_t> keys;
std::vector<float> values;
auto status = _worker_ptr->pull_geo_param(table_id, &values, &keys, ep_idx);
status.wait();
std::string param = SplitedGradToParam(varname);
VLOG(1) << "RecvSparse receive var: " << varname << " " << param << ", "
<< table_id << "; ids Size: " << keys.size()
<< "; values size: " << values.size();
auto *var_latest = recv_scope_->FindVar(param);
auto *var_old = old_scope_->FindVar(param);
auto *t_latest = var_latest->GetMutable<framework::LoDTensor>();
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest->dims()[1];
auto numel = keys.size() * dims1;
std::vector<float> v_delta;
v_delta.resize(numel);
auto cpu_ctx = paddle::platform::CPUDeviceContext();
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
for (auto j = 0; j < static_cast<int>(keys.size()); ++j) {
float *latest_data = t_latest->data<float>() + keys[j] * dims1;
float *old_data = t_old->data<float>() + keys[j] * dims1;
// pserver - old => delta
blas.VSUB(dims1, values.data() + j * dims1, old_data,
v_delta.data() + j * dims1);
// latest + delta => latest
blas.VADD(dims1, latest_data, v_delta.data() + j * dims1, latest_data);
// pserver => old
blas.VCOPY(dims1, values.data() + j * dims1, old_data);
}
VLOG(1) << "Finish Recv Sparse " << param << ", table_id: " << table_id;
}
void GeoCommunicator::MainThread() {
VLOG(3) << "MainThread start and wait";
while (waiting_ && running_) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
VLOG(3) << "wait for running";
}
while (running_) {
std::vector<std::future<void>> tasks;
tasks.reserve(parallel_task_nums_);
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
auto &varnames = ctx.origin_varnames;
auto &table_id = ctx.table_id;
if (ctx.is_sparse) {
PADDLE_ENFORCE_EQ(
varnames.size(), 1,
platform::errors::InvalidArgument(
"sparse variables can only be merged by one variables"));
int pserver_num = static_cast<int>(ctx.epmap.size());
for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) {
// varname: emb@GRAD, param_name: emb, splited_varname: emb.delta0
auto send_recv_task = [this, table_id, ep_idx, &ctx] {
auto splited_varname = ctx.splited_varnames[ep_idx];
auto sparse_ids = MergeSparseIds(splited_varname);
SendSparse(splited_varname, sparse_ids, table_id, ep_idx);
RecvSparse(splited_varname, table_id, ep_idx);
};
tasks.emplace_back(
send_threadpool_->enqueue(std::move(send_recv_task)));
}
} else {
auto send_recv_task = [this, &ctx] {
SendDense(ctx);
RecvDense(ctx);
};
tasks.emplace_back(
send_threadpool_->enqueue(std::move(send_recv_task)));
}
}
for (auto &task : tasks) {
task.wait();
}
}
}
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ThreadPool.h>
#include <stdint.h>
#include <atomic>
#include <deque>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/distributed/communicator_common.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/service/ps_client.h"
DECLARE_bool(communicator_is_sgd_optimizer);
namespace paddle {
namespace distributed {
using Scope = framework::Scope;
using Variable = framework::Variable;
template <typename T>
class BlockingQueue {
public:
explicit BlockingQueue(size_t capacity) : capacity_(capacity) {
PADDLE_ENFORCE_GT(capacity_, 0,
platform::errors::InvalidArgument(
"The capacity must be greater than 0."));
}
bool Push(const T &elem) {
{
std::unique_lock<std::mutex> lock(mutex_);
cv_.wait(lock, [&] { return queue_.size() < capacity_; });
queue_.push_back(elem);
}
cv_.notify_one();
return true;
}
bool Push(T &&elem) {
{
std::unique_lock<std::mutex> lock(mutex_);
cv_.wait(lock, [&] { return queue_.size() < capacity_; });
queue_.emplace_back(std::move(elem));
}
cv_.notify_one();
return true;
}
T Pop() {
std::unique_lock<std::mutex> lock(mutex_);
cv_.wait(lock, [=] { return !queue_.empty(); });
T rc(std::move(queue_.front()));
queue_.pop_front();
cv_.notify_one();
return rc;
}
size_t Cap() const {
std::lock_guard<std::mutex> lock(mutex_);
return capacity_;
}
size_t Size() const {
std::lock_guard<std::mutex> lock(mutex_);
return queue_.size();
}
private:
const size_t capacity_;
std::deque<T> queue_;
mutable std::mutex mutex_;
std::condition_variable cv_;
};
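// NOTE: a minimal single-producer/single-consumer sketch of BlockingQueue
// (illustrative only, guarded out of the build; assumes <thread> is visible
// via ThreadPool.h). Push blocks while the queue holds `capacity` items and
// Pop blocks while it is empty, which is how send_varname_to_queue_ throttles
// gradient producers.
#if 0
inline void BlockingQueueExample() {
  BlockingQueue<int> q(/*capacity=*/2);
  std::thread producer([&q] {
    for (int i = 0; i < 4; ++i) q.Push(i);  // blocks after 2 unconsumed items
  });
  for (int i = 0; i < 4; ++i) {
    int value = q.Pop();  // blocks until the producer pushes
    (void)value;
  }
  producer.join();
}
#endif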
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T>
inline void MergeVars(const std::string &var_name,
const std::vector<std::shared_ptr<Variable>> &vars,
Scope *scope, bool merge_add = true) {
PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
"vector vars are empty."));
auto cpu_place = platform::CPUPlace();
auto &var0 = vars[0];
auto *out_var = scope->Var(var_name);
if (var0->IsType<framework::LoDTensor>()) {
auto dims = var0->Get<framework::LoDTensor>().dims();
VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims
<< "; merge add: " << merge_add;
// init output tensor
auto *out_t = out_var->GetMutable<framework::LoDTensor>();
out_t->mutable_data<T>(dims, cpu_place);
// check the input dims
for (auto &var : vars) {
auto &var_t = var->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(
var_t.dims(), dims,
platform::errors::InvalidArgument("vars should have the same dims."));
}
// set output tensor to 0.
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
// sum all vars to out
auto result = EigenVector<T>::Flatten(*out_t);
for (auto &var : vars) {
auto &in_t = var->Get<framework::LoDTensor>();
auto in = EigenVector<T>::Flatten(in_t);
result.device(*cpu_ctx.eigen_device()) = result + in;
}
if (!merge_add) {
result.device(*cpu_ctx.eigen_device()) =
result / static_cast<T>(vars.size());
}
} else if (var0->IsType<framework::SelectedRows>()) {
auto &slr0 = var0->Get<framework::SelectedRows>();
auto *out_slr = out_var->GetMutable<framework::SelectedRows>();
out_slr->mutable_rows()->clear();
out_slr->mutable_value()->mutable_data<T>({{}}, cpu_place);
std::vector<const paddle::framework::SelectedRows *> inputs;
inputs.reserve(vars.size());
for (auto &var : vars) {
inputs.push_back(&var->Get<framework::SelectedRows>());
}
auto dev_ctx = paddle::platform::CPUDeviceContext();
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
merge_add;
merge_add(dev_ctx, inputs, out_slr);
} else {
paddle::operators::math::scatter::MergeAverage<
paddle::platform::CPUDeviceContext, T>
merge_average;
merge_average(dev_ctx, inputs, out_slr);
}
VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height()
<< " dims: " << slr0.value().dims() << "; merge add: " << merge_add;
} else {
PADDLE_THROW(platform::errors::InvalidArgument("unsupported var type: %s!",
var0->Type()));
}
}
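// NOTE: a sketch of MergeVars on two dense gradients of the same shape
// (illustrative only, guarded out of the build; assumes framework::make_ddim
// is visible through the tensor headers). With merge_add = true the result is
// an element-wise sum, otherwise an element-wise average.
#if 0
inline void MergeVarsExample(Scope *scope) {
  auto make_var = [](float value) {
    auto var = std::make_shared<Variable>();
    auto *t = var->GetMutable<framework::LoDTensor>();
    t->mutable_data<float>(framework::make_ddim({2, 2}), platform::CPUPlace());
    for (int i = 0; i < 4; ++i) t->data<float>()[i] = value;
    return var;
  };
  std::vector<std::shared_ptr<Variable>> vars = {make_var(1.0f),
                                                 make_var(3.0f)};
  MergeVars<float>("x@GRAD", vars, scope, /*merge_add=*/true);
  // scope->Var("x@GRAD") now holds a 2x2 LoDTensor filled with 4.0.
}
#endif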
using RpcCtxMap = std::unordered_map<std::string, CommContext>;
using RecvCtxMap = std::unordered_map<uint64_t, std::vector<std::string>>;
using SparseValue = std::unordered_map<int64_t, std::vector<float>>;
class Communicator {
public:
Communicator();
explicit Communicator(const std::map<std::string, std::string> &envs_) {
VLOG(0) << "Communicator Init Envs";
for (auto &iter : envs_) {
envs[iter.first] = iter.second;
VLOG(0) << iter.first << ": " << iter.second;
}
barrier_table_id_ = std::stoi(envs.at("barrier_table_id"));
trainer_id_ = std::stoi(envs.at("trainer_id"));
trainers_ = std::stoi(envs.at("trainers"));
}
virtual void InitBrpcClient(const std::string &dist_desc,
const std::vector<std::string> &host_sign_list);
// 1. recv dense param
virtual void RpcRecvDense(const std::vector<std::string> &varnames,
int table_id, Scope *scope);
// 2. send dense param
virtual void RpcSendDenseParam(const std::vector<std::string> &varnames,
int table_id, const Scope &scope);
// 3. send dense grad
virtual void RpcSendDense(const CommContext &ctx, const Scope &scope);
// 4. send sparse grad
virtual void RpcSendSparse(const std::string &var_name, int table_id,
const Scope &scope);
// 5. send sparse param
virtual void RpcSendSparseParam(const std::string &varname, int table_id,
const Scope &scope);
// 6. recv sparse param
virtual void RpcRecvSparse(const std::string &varname, int table_id,
Scope *scope);
virtual ~Communicator() {}
virtual void RpcProfilerControl();
virtual void InitParams(const RecvCtxMap &recv_varname_to_ctx);
virtual void Start() = 0;
virtual void Stop() = 0;
virtual bool IsRunning() { return running_; }
virtual void Clean() {}
virtual bool Check(const int table_id) = 0;
virtual bool Check(const std::vector<std::string> &var_tables) = 0;
virtual void Send(const std::vector<std::string> &var_names,
const framework::Scope &scope) = 0;
virtual void RecvNoBarrier() {}
virtual void Barrier() {}
virtual void BarrierWithTable(uint32_t barrier_type) {
auto rets = _worker_ptr->barrier(barrier_table_id_, barrier_type);
rets.wait();
}
virtual void BarrierTriggerDecrement() {}
virtual void BarrierTriggerReset(int init_counter) {}
virtual void InitEnvs() = 0;
virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx,
const RecvCtxMap &recv_varname_to_ctx,
Scope *recv_scope) {}
static Communicator *GetInstance() { return communicator_.get(); }
static std::shared_ptr<Communicator> GetInstantcePtr() {
return communicator_;
}
template <typename T>
static Communicator *InitInstance(
const RpcCtxMap &send_ctx, const RecvCtxMap &recv_ctx,
const std::string &dist_desc,
const std::vector<std::string> &host_sign_list, Scope *recv_scope,
const std::map<std::string, std::string> &envs) {
std::call_once(init_flag_, &Communicator::InitWithRpcCtx<T>, send_ctx,
recv_ctx, dist_desc, host_sign_list, recv_scope,
std::ref(envs));
return communicator_.get();
}
// Init is called by InitInstance.
template <typename T>
static void InitWithRpcCtx(const RpcCtxMap &send_ctx,
const RecvCtxMap &recv_ctx,
const std::string &dist_desc,
const std::vector<std::string> &host_sign_list,
Scope *recv_scope,
const std::map<std::string, std::string> &envs) {
if (communicator_.get() == nullptr) {
communicator_.reset(new T(std::ref(envs)));
communicator_->InitEnvs();
communicator_->InitBrpcClient(dist_desc, host_sign_list);
communicator_->InitImpl(send_ctx, recv_ctx, recv_scope);
}
}
PSClient *GetPsClient() { return _worker_ptr.get(); }
std::shared_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
return _worker_ptr;
}
std::shared_ptr<PSClient> _worker_ptr; // pointer to worker
protected:
bool running_ = false;
bool waiting_ = true;
bool flushing_ = false;
bool do_server_profiler_ = false;
static std::shared_ptr<Communicator> communicator_;
static std::once_flag init_flag_;
std::unordered_map<std::string, std::string> envs;
// compute the dense storage size assigned to each shard
inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total,
uint32_t shard_num) {
return dense_dim_total / shard_num + 1;
}
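// worked example: dense_dim_total = 100 and shard_num = 3 gives
// 100 / 3 + 1 = 34 dims per shard, so 3 shards cover all 100 dims with a
// little padding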
void init_gflag(const std::string &gflags);
paddle::distributed::PSParameter _ps_param;
paddle::distributed::PaddlePSEnvironment _ps_env;
int servers_ = 0;
int trainers_;
int trainer_id_ = 0;
int barrier_table_id_ = 0;
RpcCtxMap send_varname_to_ctx_;
RecvCtxMap recv_varname_to_ctx_;
Scope *recv_scope_; // should be global scope
std::unique_ptr<Scope> xpu_temp_scope_;
std::atomic<uint32_t> _async_call_num{0};
};
class AsyncCommunicator : public Communicator {
public:
AsyncCommunicator() : Communicator() {}
explicit AsyncCommunicator(const std::map<std::string, std::string> &envs)
: Communicator(envs) {}
~AsyncCommunicator();
void InitEnvs() {
independent_recv_ = static_cast<bool>(
std::stoi(envs.at("communicator_independent_recv_thread")));
min_send_grad_num_before_recv_ =
std::stoi(envs.at("communicator_min_send_grad_num_before_recv"));
thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size"));
max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num"));
send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times"));
send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size"));
need_global_step_ =
static_cast<bool>(std::stoi(envs.at("need_global_step")));
}
void Start() override;
void Stop() override;
void InitImpl(const RpcCtxMap &send_varname_to_ctx,
const RecvCtxMap &recv_varname_to_ctx,
Scope *recv_scope) override;
virtual void MainThread();
virtual void RecvThread();
virtual bool Check(const int table_id);
virtual bool Check(const std::vector<std::string> &var_tables);
void Send(const std::vector<std::string> &var_names,
const framework::Scope &scope) override;
virtual void SendByCommunicator();
virtual void SendGlobalStep(int batches) {}
virtual void RecvByCommunicator();
virtual void RecvNoBarrier();
virtual int BatchesCounter() { return 1; }
virtual void BarrierSend() {}
virtual void BarrierRecv() {}
virtual void BarrierWeakUp() {}
protected:
std::unordered_map<std::string,
std::shared_ptr<BlockingQueue<std::shared_ptr<Variable>>>>
send_varname_to_queue_;
std::unique_ptr<::ThreadPool> send_threadpool_{nullptr};
int min_send_grad_num_before_recv_;
int thread_pool_size_;
int max_merge_var_num_;
int send_wait_times_;
int send_queue_size_;
bool need_global_step_ = false;
bool independent_recv_ = true;
int parallel_task_nums_ = 0;
std::unique_ptr<std::thread> main_thread_{nullptr};
std::unique_ptr<std::thread> recv_thread_{nullptr};
std::unique_ptr<Scope> send_scope_; // an independent scope
std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv
};
class HalfAsyncCommunicator : public AsyncCommunicator {
public:
HalfAsyncCommunicator() {}
explicit HalfAsyncCommunicator(const std::map<std::string, std::string> &envs)
: AsyncCommunicator(envs) {}
void InitEnvs() {
// enforce recv after send
independent_recv_ = false;
min_send_grad_num_before_recv_ = 0;
thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size"));
max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num"));
send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times"));
send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size"));
need_global_step_ =
static_cast<bool>(std::stoi(envs.at("need_global_step")));
VLOG(0) << "HalfAsyncCommunicator Initialized";
}
void MainThread() override;
void SendByCommunicator() override;
void Clean() override;
void Barrier() override;
void BarrierTriggerDecrement() override;
void BarrierTriggerReset(int initial_val) override;
int BatchesCounter();
void BarrierWeakUp();
protected:
// mutex for Wait for barrier
std::mutex barrier_mutex_;
std::condition_variable barrier_cond_;
std::atomic<int64_t> barrier_trigger_{0};
std::atomic<int64_t> barrier_counter_{0};
};
class SyncCommunicator : public HalfAsyncCommunicator {
public:
SyncCommunicator() : HalfAsyncCommunicator() {}
explicit SyncCommunicator(const std::map<std::string, std::string> &envs)
: HalfAsyncCommunicator(envs) {}
void InitEnvs() {
// enforce recv after send
independent_recv_ = false;
min_send_grad_num_before_recv_ = 0;
max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num"));
send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times"));
thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size"));
send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size"));
need_global_step_ =
static_cast<bool>(std::stoi(envs.at("need_global_step")));
VLOG(0) << "SyncCommunicator Initialized";
}
void BarrierSend();
void BarrierRecv();
private:
std::vector<std::string> pserver_endpoints_{};
};
class GeoCommunicator : public AsyncCommunicator {
public:
GeoCommunicator() : AsyncCommunicator() {}
explicit GeoCommunicator(const std::map<std::string, std::string> &envs)
: AsyncCommunicator(envs) {}
void InitImpl(const RpcCtxMap &send_varname_to_ctx,
const RecvCtxMap &recv_varname_to_ctx,
Scope *recv_scope) override;
void InitParams(const RecvCtxMap &recv_varname_to_ctx) override;
void InitDense(std::vector<std::string> &varnames, int table_id);
void InitSparse(const std::string &var_name, int table_id);
void SendDense(const CommContext &send_ctx);
void RecvDense(const CommContext &send_ctx);
std::vector<int64_t> MergeSparseIds(const std::string &varname);
void SendSparse(const std::string &varname, std::vector<int64_t> &sparse_ids,
int table_id, int ep_idx);
void RecvSparse(const std::string &varname, int table_id, int ep_idx);
void MainThread() override;
void InitEnvs() {
independent_recv_ = false;
min_send_grad_num_before_recv_ = 0;
send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times"));
thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size"));
// id_queue's size
max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num"));
send_queue_size_ = max_merge_var_num_;
VLOG(0) << "GeoCommunicator Initialized";
}
void Send(const std::vector<std::string> &var_names,
const framework::Scope &scope) override;
void SendByCommunicator() { return; }
void SendGlobalStep(int batches) override { return; }
void RecvByCommunicator() override { return; }
inline std::string GradToParam(const std::string var_name) {
std::string param_name = var_name.substr(0, var_name.size() - 5);
return param_name;
}
inline std::string SplitedGradToParam(const std::string delta_name) {
// delta_name: emb.delta0
auto pos = delta_name.find(".block");
std::string param_name = delta_name.substr(0, pos);
return param_name;
}
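// NOTE: examples for the two helpers above (names are illustrative):
//   GradToParam("emb@GRAD")          -> "emb"  (strips the 5-char "@GRAD")
//   SplitedGradToParam("emb.block0") -> "emb"  (cuts at ".block")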
private:
// parameter for delta calc and send
std::shared_ptr<Scope> delta_scope_;
// parameter for storage the pserver param after last recv
std::shared_ptr<Scope> old_scope_;
// parameter on pserver
std::shared_ptr<Scope> pserver_scope_;
std::unordered_map<
std::string,
std::shared_ptr<BlockingQueue<std::shared_ptr<std::vector<int64_t>>>>>
sparse_id_queues_;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/env.h"
namespace paddle {
namespace distributed {} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <arpa/inet.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <netinet/in.h>
#include <stdio.h>
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
namespace paddle {
namespace distributed {
struct PSHost {
std::string ip;
uint32_t port;
uint32_t rank;
PSHost() = default;
PSHost(const std::string ip, uint32_t port, uint32_t rank)
: ip(ip), port(port), rank(rank) {}
// |---ip---|---port---|--rank--|
// |-32bit--|--20bit---|--12bit-|
// for pslib
uint64_t serialize_to_uint64() {
uint64_t host_label = 0;
host_label = inet_addr(ip.c_str());
host_label = host_label << 32;
host_label += (port << 12);
host_label += rank;
return host_label;
}
void parse_from_uint64(uint64_t host_label) {
static uint64_t rank_label_mask = (1L << 12) - 1;
static uint64_t port_label_mask = (1L << 20) - 1;
rank = host_label & rank_label_mask;
port = (host_label >> 12) & port_label_mask;
uint32_t ip_addr = (host_label >> 32);
ip = inet_ntoa(*(in_addr *)&ip_addr);
}
std::string to_string() {
std::stringstream s;
s << "host: " << ip;
s << " port: " << port;
s << " rank: " << rank;
s << " uint: " << serialize_to_uint64();
return s.str();
}
// for open source parameter server
std::string serialize_to_string() {
std::stringstream s;
s << ip << ":";
s << port << ":";
s << rank;
return s.str();
}
void parse_from_string(std::string endpoint) {
std::vector<std::string> endpoint_info;
string_split(endpoint, ':', &endpoint_info);
ip = endpoint_info[0];
port = std::stoi(endpoint_info[1]);
rank = std::stoi(endpoint_info[2]);
}
void string_split(const std::string &str, char sep,
std::vector<std::string> *pieces, bool ignore_null = true) {
pieces->clear();
if (str.empty()) {
if (!ignore_null) {
pieces->push_back(str);
}
return;
}
size_t pos = 0;
size_t next = str.find(sep, pos);
while (next != std::string::npos) {
pieces->push_back(str.substr(pos, next - pos));
pos = next + 1;
next = str.find(sep, pos);
}
if (!str.substr(pos).empty()) {
pieces->push_back(str.substr(pos));
}
}
};
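// NOTE: a round-trip sketch of the two PSHost encodings above (illustrative
// only, guarded out of the build): the pslib 64-bit sign and the
// "ip:port:rank" string used by the open-source parameter server.
#if 0
inline void PSHostExample() {
  PSHost host("127.0.0.1", 8200, 3);
  uint64_t sign = host.serialize_to_uint64();  // |ip 32bit|port 20bit|rank 12bit|
  PSHost decoded;
  decoded.parse_from_uint64(sign);  // decoded.ip/port/rank match host
  PSHost from_str;
  from_str.parse_from_string(host.serialize_to_string());  // "127.0.0.1:8200:3"
}
#endif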
class PSEnvironment {
public:
explicit PSEnvironment() {}
virtual ~PSEnvironment() {}
virtual int32_t set_ps_servers(uint64_t *host_sign_list, int node_num) {
return 0;
}
virtual int32_t set_ps_servers(
const std::vector<std::string> *host_endpoint_list, int node_num) {
return 0;
}
virtual int32_t set_ps_clients(uint64_t *host_sign_list, int node_num) {
return 0;
}
virtual int32_t set_ps_clients(std::string *host_endpoint_list,
int node_num) {
return 0;
}
virtual uint64_t get_local_host_sign() { return 0; }
virtual std::vector<PSHost> get_ps_servers() const { return _ps_server_list; }
virtual int32_t registe_ps_server(const std::string &ip, uint32_t port,
int32_t rank) {
return registe_ps_host(ip, port, rank, _ps_server_list,
_ps_server_sign_set);
}
virtual std::vector<PSHost> get_ps_clients() const { return _ps_client_list; }
virtual int32_t registe_ps_client(const std::string &ip, uint32_t port,
int32_t rank) {
return registe_ps_host(ip, port, rank, _ps_client_list,
_ps_client_sign_set);
}
virtual std::vector<uint64_t> get_client_info() {
std::vector<uint64_t> client_info;
for (auto &i : _ps_client_sign_set) {
client_info.push_back(i);
}
return client_info;
}
virtual std::vector<std::string> get_client_info(bool use_string_endpoint) {
if (use_string_endpoint) {
std::vector<std::string> client_info;
for (auto &i : _ps_client_list) {
client_info.push_back(i.serialize_to_string());
}
return client_info;
}
return {};
}
protected:
// register a host
virtual int32_t registe_ps_host(const std::string &ip, uint32_t port,
int32_t rank, std::vector<PSHost> &host_list,
std::unordered_set<uint64_t> &sign_set) {
PSHost host;
host.ip = ip;
host.port = port;
host.rank = rank;
if (sign_set.count(rank) > 0) {
LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port
<< ", rank:" << host.rank
<< " already register, ignore register";
} else {
host_list.push_back(host);
sign_set.insert(rank);
}
// if (sign_set.count(host.serialize_to_uint64()) > 0) {
// LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port
// << ", rank:" << host.rank
// << " already register, ignore register";
// } else {
// host_list.push_back(host);
// sign_set.insert(host.serialize_to_uint64());
// }
return 0;
}
std::vector<PSHost> _ps_client_list;
std::unordered_set<uint64_t> _ps_client_sign_set; // for unique filter
std::vector<PSHost> _ps_server_list;
std::unordered_set<uint64_t> _ps_server_sign_set; // for unique filter
};
class PaddlePSEnvironment : public PSEnvironment {
public:
explicit PaddlePSEnvironment() {}
virtual ~PaddlePSEnvironment() {}
virtual int32_t set_ps_servers(uint64_t *host_sign_list, int node_num) {
_ps_server_list.clear();
_ps_server_sign_set.clear();
for (int i = 0; i < node_num; ++i) {
if (host_sign_list[i] > 0) {
PSHost host;
host.parse_from_uint64(host_sign_list[i]);
_ps_server_list.push_back(host);
_ps_server_sign_set.insert(host.serialize_to_uint64());
}
}
std::sort(
_ps_server_list.begin(), _ps_server_list.end(),
[](const PSHost &h1, const PSHost &h2) { return h1.rank < h2.rank; });
return 0;
}
virtual int32_t set_ps_servers(const std::vector<std::string> *host_sign_list,
int node_num) {
_ps_server_list.clear();
_ps_server_sign_set.clear();
for (int i = 0; i < node_num; ++i) {
if (host_sign_list->at(i) != "") {
PSHost host;
host.parse_from_string(host_sign_list->at(i));
_ps_server_list.push_back(host);
_ps_server_sign_set.insert(host.rank);
}
}
std::sort(
_ps_server_list.begin(), _ps_server_list.end(),
[](const PSHost &h1, const PSHost &h2) { return h1.rank < h2.rank; });
return 0;
}
virtual int32_t set_ps_clients(uint64_t *host_sign_list, int node_num) {
_ps_client_list.clear();
_ps_client_sign_set.clear();
for (int i = 0; i < node_num; ++i) {
if (host_sign_list[i] > 0) {
PSHost host;
host.parse_from_uint64(host_sign_list[i]);
_ps_client_list.push_back(host);
_ps_client_sign_set.insert(host.serialize_to_uint64());
}
}
std::sort(
_ps_client_list.begin(), _ps_client_list.end(),
[](const PSHost &h1, const PSHost &h2) { return h1.rank < h2.rank; });
return 0;
}
virtual int32_t set_ps_clients(std::vector<std::string> *host_sign_list,
int node_num) {
_ps_client_list.clear();
_ps_client_sign_set.clear();
for (int i = 0; i < node_num; ++i) {
if (host_sign_list->at(i) != "") {
PSHost host;
host.parse_from_string(host_sign_list->at(i));
_ps_client_list.push_back(host);
_ps_client_sign_set.insert(host.rank);
}
}
std::sort(
_ps_client_list.begin(), _ps_client_list.end(),
[](const PSHost &h1, const PSHost &h2) { return h1.rank < h2.rank; });
return 0;
}
virtual uint64_t get_local_host_sign() {
if (_ps_client_list.size() > 0) {
return _ps_client_list[0].serialize_to_uint64();
} else {
return 0;
}
}
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/heter_client.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/timer.h"
DECLARE_int32(rpc_deadline);
namespace paddle {
namespace distributed {
DEFINE_int32(pserver_timeout_ms, 10800000, "pserver request server timeout_ms");
std::shared_ptr<HeterClient> HeterClient::s_instance_ = NULL;
bool HeterClient::is_initialized_ = false;
void HeterClient::MainThread() {
while (running_) {
RpcProfilerControl();
}
}
void HeterClient::Stop() {
running_ = false;
if (!is_initialized_) {
VLOG(0) << "HeterClient is not inited, do nothing";
} else {
if (main_thread_) {
auto status = StopHeterWorker();
status.wait();
main_thread_->join();
main_thread_.reset(nullptr);
}
VLOG(1) << "HeterClient Stop Done";
}
}
void HeterClient::RpcProfilerControl() {
if (trainer_id_ == 0) {
if (!do_server_profiler_ && platform::IsProfileEnabled()) {
// send profiler start flag
do_server_profiler_ = true;
auto start_status = StartProfiler();
start_status.wait();
} else if (do_server_profiler_ && !platform::IsProfileEnabled()) {
// send profiler end flag
auto stop_status = StopProfiler();
stop_status.wait();
do_server_profiler_ = false;
}
}
}
void HeterClient::CreateClient2XpuConnection() {
brpc::ChannelOptions options;
options.protocol = "baidu_std";
options.connection_type = "single";
options.timeout_ms = pserver_timeout_ms;
xpu_channels_.resize(xpu_list_.size());
for (size_t i = 0; i < xpu_list_.size(); ++i) {
xpu_channels_[i].reset(new brpc::Channel());
if (xpu_channels_[i]->Init(xpu_list_[i].c_str(), "", &options) != 0) {
VLOG(0) << "HeterServer channel init fail";
}
}
}
void HeterClient::SendAndRecvAsync(
const std::vector<std::string>& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& message_name,
const std::vector<std::string>& send_var_name,
const std::vector<std::string>& recv_var_name) {
platform::RecordEvent record_event("HeterClient->SendAndRecvAsync");
const platform::DeviceContext* p_ctx = &ctx;
const framework::Scope* p_scope = &scope;
const std::string message_name_val = message_name;
const std::vector<std::string> send_var_name_val = send_var_name;
const std::vector<std::string> recv_var_name_val = recv_var_name;
VLOG(3) << "GRPCClient::SendAndRecv Begin, message_name: "
<< message_name_val;
// TODO: get the correct channel
int num = trainer_id_ % xpu_channels_.size();
brpc::Controller cntl;
cntl.set_timeout_ms(pserver_timeout_ms);
distributed::MultiVarMsg request, response;
auto& request_io_buffer = cntl.request_attachment();
::paddle::PsService_Stub stub(xpu_channels_[num].get());
distributed::SerializeToMultiVarMsgAndIOBuf(
message_name_val, send_var_name_val, recv_var_name_val, *p_ctx, p_scope,
&request, &request_io_buffer);
stub.SendAndRecvVariable(&cntl, &request, &response, NULL);
PADDLE_ENFORCE_NE(
cntl.Failed(), true,
platform::errors::Unimplemented(
"HeterClient::SendAndRecv meets brpc error, error message is %s",
cntl.ErrorText()));
VLOG(4) << "call heter_worker success";
auto& response_io_buffer = cntl.response_attachment();
distributed::DeserializeFromMultiVarMsgAndIOBuf(response, &response_io_buffer,
ctx, p_scope);
}
std::future<int32_t> HeterClient::SendCmd(
uint32_t table_id, int cmd_id, const std::vector<std::string>& params) {
size_t request_call_num = xpu_channels_.size();
paddle::distributed::DownpourBrpcClosure* closure =
new paddle::distributed::DownpourBrpcClosure(
request_call_num, [request_call_num, cmd_id](void* done) {
int ret = 0;
auto* closure = (paddle::distributed::DownpourBrpcClosure*)done;
for (size_t i = 0; i < request_call_num; ++i) {
if (closure->check_response(i, cmd_id) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (size_t i = 0; i < request_call_num; ++i) {
closure->request(i)->set_cmd_id(cmd_id);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(trainer_id_);
for (const auto& param : params) {
closure->request(i)->add_params(param);
}
::paddle::PsService_Stub rpc_stub(xpu_channels_[i].get());
closure->cntl(i)->set_timeout_ms(
pserver_timeout_ms); // cmd msgs don't limit the timeout for save/load
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
return fut;
}
std::future<int32_t> HeterClient::StartProfiler() {
return SendCmd(-1, PS_START_PROFILER, {});
}
std::future<int32_t> HeterClient::StopProfiler() {
return SendCmd(-1, PS_STOP_PROFILER, {});
}
} // end namespace distributed
} // end namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <ctime>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace distributed {
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
typedef std::function<void(void*)> HeterRpcCallbackFunc;
class OnHeterRpcDone : public google::protobuf::Closure {
public:
OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
virtual ~OnHeterRpcDone() {}
void Run() {
std::unique_ptr<OnHeterRpcDone> self_guard(this);
handler_(this);
}
HeterRpcCallbackFunc handler_;
MultiVariableMessage response;
brpc::Controller cntl;
};
class HeterClient {
public:
virtual ~HeterClient() {}
HeterClient() {
running_ = true;
main_thread_.reset(
new std::thread(std::bind(&HeterClient::MainThread, this)));
}
void CreateClient2XpuConnection();
void SendAndRecvAsync(const std::vector<std::string>& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& message_name,
const std::vector<std::string>& send_var_name,
const std::vector<std::string>& recv_var_name);
// HeterClient singleton
static std::shared_ptr<HeterClient> GetInstance(
const std::vector<std::string>& endpoint, const int& trainer_id) {
if (NULL == s_instance_) {
is_initialized_ = true;
s_instance_.reset(new paddle::distributed::HeterClient());
std::vector<std::string> xpu_list = {endpoint};
s_instance_->SetXpuList(endpoint);
s_instance_->SetTrainerID(trainer_id);
s_instance_->CreateClient2XpuConnection();
}
return s_instance_;
}
void Stop();
void MainThread();
void RpcProfilerControl();
std::future<int32_t> SendCmd(uint32_t table_id, int cmd_id,
const std::vector<std::string>& params);
std::future<int32_t> StartProfiler();
std::future<int32_t> StopProfiler();
std::future<int32_t> StopHeterWorker();
std::vector<std::string>& GetXpuList() { return xpu_list_; }
void SetXpuList(const std::vector<std::string>& xpu_list) {
xpu_list_ = xpu_list;
}
void SetTrainerID(const int& trainer_id) { trainer_id_ = trainer_id; }
private:
static std::shared_ptr<HeterClient> s_instance_;
protected:
static bool is_initialized_;
std::unique_ptr<std::thread> main_thread_{nullptr};
std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
DISABLE_COPY_AND_ASSIGN(HeterClient);
std::vector<std::string> xpu_list_;
bool running_ = false;
int trainer_id_;
bool do_server_profiler_ = false;
};
} // end namespace distributed
} // end namespace paddle
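A minimal usage sketch of the heter client above, assuming a heter worker already listens on 127.0.0.1:8500 (an illustrative endpoint); only GetInstance, StartProfiler, StopProfiler and Stop are taken from heter_client.h:

// usage sketch (illustrative): drive the heter-worker profiler from a trainer
#include <string>
#include <vector>
#include "paddle/fluid/distributed/service/heter_client.h"

void ProfileHeterWorkers() {
  std::vector<std::string> xpu_endpoints = {"127.0.0.1:8500"};  // assumed endpoint
  int trainer_id = 0;                                           // assumed id
  auto client =
      paddle::distributed::HeterClient::GetInstance(xpu_endpoints, trainer_id);
  // StartProfiler/StopProfiler are thin wrappers over SendCmd with
  // PS_START_PROFILER / PS_STOP_PROFILER; each future resolves once every
  // heter worker has answered.
  client->StartProfiler().wait();
  // ... run a few mini-batches here ...
  client->StopProfiler().wait();
  client->Stop();  // shut the client down
}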
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/heter_server.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace distributed {
std::shared_ptr<HeterServer> HeterServer::s_instance_ = NULL;
void HeterServer::RegisterServiceHandler(std::string message_name,
HeterServiceHandler func) {
service_.RegisterServiceHandler(message_name, func);
}
void HeterServer::StartHeterService() {
server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
brpc::ServerOptions options;
if (server_.Start(endpoint_.c_str(), &options) != 0) {
VLOG(0) << "heter server start fail";
} else {
VLOG(0) << "heter server start success! listen on " << endpoint_;
}
{
std::lock_guard<std::mutex> lock(this->mutex_ready_);
ready_ = 1;
}
condition_ready_.notify_all();
server_.Join();
}
void HeterServer::SetEndPoint(std::string& endpoint) {
endpoint_ = endpoint;
service_.SetEndpoint(endpoint);
}
void HeterServer::SetFanin(int& fan_in) { service_.SetFanin(fan_in); }
void HeterServer::WaitServerReady() {
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
}
int32_t HeterService::stop_profiler(const PsRequestMessage& request,
PsResponseMessage& response,
brpc::Controller* cntl) {
platform::DisableProfiler(
platform::EventSortingKey::kDefault,
string::Sprintf("heter_worker_%s_profile", endpoint_));
return 0;
}
int32_t HeterService::start_profiler(const PsRequestMessage& request,
PsResponseMessage& response,
brpc::Controller* cntl) {
platform::EnableProfiler(platform::ProfilerState::kAll);
return 0;
}
int32_t HeterService::stop_heter_worker(const PsRequestMessage& request,
PsResponseMessage& response,
brpc::Controller* cntl) {
auto client_id = request.client_id();
stop_cpu_worker_set_.insert(client_id);
if (stop_cpu_worker_set_.size() == fan_in_) {
is_exit_ = true;
}
return 0;
}
} // end namespace distributed
} // end namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <ctime>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace distributed {
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
class HeterService;
typedef int32_t (HeterService::*serviceHandlerFunc)(
const PsRequestMessage& request, PsResponseMessage& response,
brpc::Controller* cntl);
typedef std::function<void(void*)> HeterRpcCallbackFunc;
typedef std::function<int(const MultiVarMsg*, MultiVarMsg*, brpc::Controller*)>
HeterServiceHandler;
class HeterService : public ::paddle::PsService {
public:
HeterService() {
_service_handler_map[PS_STOP_SERVER] = &HeterService::stop_heter_worker;
_service_handler_map[PS_START_PROFILER] = &HeterService::start_profiler;
_service_handler_map[PS_STOP_PROFILER] = &HeterService::stop_profiler;
}
virtual ~HeterService() {}
virtual void service(::google::protobuf::RpcController* controller,
const ::paddle::PsRequestMessage* request,
::paddle::PsResponseMessage* response,
::google::protobuf::Closure* done) {
brpc::ClosureGuard done_guard(done);
std::string log_label("ReceiveCmd-");
response->set_err_code(0);
response->set_err_msg("");
brpc::Controller* cntl = static_cast<brpc::Controller*>(controller);
auto itr = _service_handler_map.find(request->cmd_id());
if (itr == _service_handler_map.end()) {
std::string err_msg(
"undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:");
err_msg.append(std::to_string(request->cmd_id()));
response->set_err_code(-1);
response->set_err_msg(err_msg);
return;
}
serviceHandlerFunc handler_func = itr->second;
int service_ret = (this->*handler_func)(*request, *response, cntl);
if (service_ret != 0) {
response->set_err_code(service_ret);
response->set_err_msg("server internal error");
}
};
void SendAndRecvVariable(::google::protobuf::RpcController* controller,
const MultiVarMsg* request, MultiVarMsg* response,
::google::protobuf::Closure* done) {
brpc::ClosureGuard done_guard(done);
std::string message_name = request->message_name();
auto itr = handler_map_.find(message_name);
brpc::Controller* cntl = static_cast<brpc::Controller*>(controller);
PADDLE_ENFORCE_NE(
itr, handler_map_.end(),
platform::errors::InvalidArgument(
"HeterService::SendAndRecvVariable Get illegal message_name: %s "
"which is not in HeterService::handler_map_",
message_name));
itr->second(request, response, cntl);
}
void RegisterServiceHandler(std::string message_name,
HeterServiceHandler func) {
handler_map_[message_name] = func;
}
void SetEndpoint(const std::string& end_point) { endpoint_ = end_point; }
void SetFanin(const int& fan_in) { fan_in_ = fan_in; }
bool IsExit() { return is_exit_; }
private:
int32_t stop_profiler(const PsRequestMessage& request,
PsResponseMessage& response, brpc::Controller* cntl);
int32_t start_profiler(const PsRequestMessage& request,
PsResponseMessage& response, brpc::Controller* cntl);
int32_t stop_heter_worker(const PsRequestMessage& request,
PsResponseMessage& response,
brpc::Controller* cntl);
private:
std::string endpoint_;
std::unordered_map<std::string, HeterServiceHandler> handler_map_;
std::unordered_map<int32_t, serviceHandlerFunc> _service_handler_map;
std::unordered_set<int> stop_cpu_worker_set_;
int fan_in_;
bool is_exit_ = false;
};
class HeterServer {
public:
virtual ~HeterServer() {}
void Stop() {
server_.Stop(1000);
server_.Join();
}
bool IsExit() { return service_.IsExit(); }
HeterServer() {}
void RegisterServiceHandler(std::string message_name,
HeterServiceHandler func);
void StartHeterService();
void SetEndPoint(std::string& endpoint);
void SetFanin(int& fan_in);
// HeterWrapper singleton
static std::shared_ptr<HeterServer> GetInstance() {
if (NULL == s_instance_) {
s_instance_.reset(new HeterServer());
}
return s_instance_;
}
void WaitServerReady();
private:
static std::shared_ptr<HeterServer> s_instance_;
std::string endpoint_;
protected:
brpc::Server server_;
HeterService service_;
DISABLE_COPY_AND_ASSIGN(HeterServer);
std::mutex mutex_ready_;
std::condition_variable condition_ready_;
int ready_ = 0;
};
class HeterRequestHandler {
public:
HeterRequestHandler()
: dev_ctx_(nullptr),
executor_(nullptr),
scope_(nullptr),
program_(nullptr) {}
virtual ~HeterRequestHandler() {}
void SetScope(framework::Scope* scope) { scope_ = scope; }
void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
void SetProgram(framework::ProgramDesc* program) { program_ = program; }
void SetExecutor(framework::Executor* executor) { executor_ = executor; }
void SetGradToPreparedCtx(
std::unordered_map<
std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
message_to_prepared_ctx_ = g;
}
virtual int Handle(const MultiVarMsg* request, MultiVarMsg* response,
brpc::Controller* cntl) = 0;
protected:
const platform::DeviceContext* dev_ctx_;
framework::Executor* executor_;
framework::Scope* scope_;
framework::ProgramDesc* program_;
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>*
message_to_prepared_ctx_;
};
class RequestSendAndRecvHandler final : public HeterRequestHandler {
public:
RequestSendAndRecvHandler() {}
virtual ~RequestSendAndRecvHandler() {}
int Handle(const MultiVarMsg* request, MultiVarMsg* response,
brpc::Controller* cntl) override {
platform::RecordEvent record_event("RequestSendAndRecvHandler->Handle");
auto& local_scope = scope_->NewScope();
auto message_name = request->message_name();
auto& request_io_buffer = cntl->request_attachment();
distributed::DeserializeFromMultiVarMsgAndIOBuf(
*request, &request_io_buffer, *dev_ctx_, &local_scope);
executor_->RunPreparedContext(
(*message_to_prepared_ctx_)[message_name].get(), &local_scope, false);
auto response_var_nums = request->recv_var_names_size();
std::vector<std::string> response_var_names(response_var_nums),
empty_var_names{};
for (int var_idx = 0; var_idx < response_var_nums; ++var_idx) {
response_var_names[var_idx] = request->recv_var_names(var_idx);
}
auto& response_io_buffer = cntl->response_attachment();
distributed::SerializeToMultiVarMsgAndIOBuf(
message_name, response_var_names, empty_var_names, *dev_ctx_,
&local_scope, response, &response_io_buffer);
scope_->DeleteScope(&local_scope);
return 0;
}
};
} // end namespace distributed
} // end namespace paddle
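For orientation, a wiring sketch for the serving side of the same RPC, assuming the trainer has already built the scope, device context, program, executor and prepared-context map, and that the message name "forward" is only an illustrative choice; every call used here is declared in heter_server.h:

// wiring sketch (illustrative): stand up a heter worker around the handler above
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <unordered_map>
#include "paddle/fluid/distributed/service/heter_server.h"

void RunHeterWorker(
    std::string endpoint, int fan_in, paddle::framework::Scope* scope,
    const paddle::platform::DeviceContext* dev_ctx,
    paddle::framework::ProgramDesc* program, paddle::framework::Executor* executor,
    std::unordered_map<std::string,
                       std::shared_ptr<paddle::framework::ExecutorPrepareContext>>*
        prepared_ctx) {
  using paddle::distributed::HeterServer;
  using paddle::distributed::MultiVarMsg;
  using paddle::distributed::RequestSendAndRecvHandler;

  auto handler = std::make_shared<RequestSendAndRecvHandler>();
  handler->SetScope(scope);
  handler->SetDevCtx(dev_ctx);
  handler->SetProgram(program);
  handler->SetExecutor(executor);
  handler->SetGradToPreparedCtx(prepared_ctx);

  auto server = HeterServer::GetInstance();
  server->SetEndPoint(endpoint);
  server->SetFanin(fan_in);
  // every incoming MultiVariableMessage named "forward" is routed to the handler
  server->RegisterServiceHandler(
      "forward", [handler](const MultiVarMsg* request, MultiVarMsg* response,
                           brpc::Controller* cntl) -> int {
        return handler->Handle(request, response, cntl);
      });

  std::thread server_thread([server] { server->StartHeterService(); });
  server->WaitServerReady();
  // trainers can now call HeterClient::SendAndRecvAsync with message "forward";
  // once fan_in distinct trainers have sent PS_STOP_SERVER, IsExit() flips
  while (!server->IsExit()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
  }
  server->Stop();
  server_thread.join();
}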
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/ps_client.h"
#include <map>
#include "brpc/server.h"
#include "glog/logging.h"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/table/table.h"
namespace paddle {
namespace distributed {
REGISTER_CLASS(PSClient, BrpcPsClient);
int32_t PSClient::configure(
const PSParameter &config,
const std::map<uint64_t, std::vector<paddle::distributed::Region>> &regions,
PSEnvironment &env, size_t client_id) {
_env = &env;
_config = config;
_dense_pull_regions = regions;
_client_id = client_id;
_config.mutable_worker_param()
->mutable_downpour_worker_param()
->mutable_downpour_table_param()
->CopyFrom(_config.server_param()
.downpour_server_param()
.downpour_table_param());
const auto &work_param = _config.worker_param().downpour_worker_param();
for (size_t i = 0; i < work_param.downpour_table_param_size(); ++i) {
auto *accessor = CREATE_CLASS(
ValueAccessor,
work_param.downpour_table_param(i).accessor().accessor_class());
accessor->configure(work_param.downpour_table_param(i).accessor());
accessor->initialize();
_table_accessors[work_param.downpour_table_param(i).table_id()].reset(
accessor);
}
return initialize();
}
PSClient *PSClientFactory::create(const PSParameter &ps_config) {
const auto &config = ps_config.server_param();
if (!config.has_downpour_server_param()) {
LOG(ERROR) << "miss downpour_server_param in ServerParameter";
return NULL;
}
if (!config.downpour_server_param().has_service_param()) {
LOG(ERROR) << "miss service_param in ServerParameter.downpour_server_param";
return NULL;
}
if (!config.downpour_server_param().service_param().has_client_class()) {
LOG(ERROR) << "miss client_class in "
"ServerParameter.downpour_server_param.service_param";
return NULL;
}
const auto &service_param = config.downpour_server_param().service_param();
PSClient *client = CREATE_CLASS(PSClient, service_param.client_class());
if (client == NULL) {
LOG(ERROR) << "client is not registered, server_name:"
<< service_param.client_class();
return NULL;
}
TableManager::instance().initialize();
LOG(INFO) << "Create PSClient[" << service_param.client_class()
<< "] success";
return client;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <future>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/service/env.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/table/accessor.h"
namespace paddle {
namespace distributed {
typedef std::function<void(void *)> PSClientCallBack;
class PSClientClosure : public google::protobuf::Closure {
public:
PSClientClosure(PSClientCallBack callback) : _callback(callback) {}
virtual ~PSClientClosure() {}
virtual void set_promise_value(int value) {
for (auto &promise : _promises) {
promise->set_value(value);
}
}
void add_promise(std::shared_ptr<std::promise<int32_t>> &promise) {
_promises.push_back(promise);
}
protected:
PSClientCallBack _callback;
std::vector<std::shared_ptr<std::promise<int32_t>>> _promises;
};
class PSClient {
public:
PSClient() {}
virtual ~PSClient() {}
PSClient(PSClient &&) = delete;
PSClient(const PSClient &) = delete;
virtual int32_t configure(
const PSParameter &config,
const std::map<uint64_t, std::vector<paddle::distributed::Region>>
&regions,
PSEnvironment &_env, size_t client_id) final;
virtual int32_t create_client2client_connection(
int pserver_timeout_ms, int pserver_connect_timeout_ms,
int max_retry) = 0;
// trigger data eviction (shrink) on the given table
virtual std::future<int32_t> shrink(uint32_t table_id) = 0;
// load data for all tables
virtual std::future<int32_t> load(const std::string &epoch,
const std::string &mode) = 0;
// load data for the specified table
virtual std::future<int32_t> load(uint32_t table_id, const std::string &epoch,
const std::string &mode) = 0;
// save data of all tables; the value_accessor may apply different save
// conditions depending on mode
virtual std::future<int32_t> save(const std::string &epoch,
const std::string &mode) = 0;
// save data of the specified table; the value_accessor may apply different
// save conditions depending on mode
virtual std::future<int32_t> save(uint32_t table_id, const std::string &epoch,
const std::string &mode) = 0;
// clear table data
virtual std::future<int32_t> clear() = 0;
virtual std::future<int32_t> clear(uint32_t table_id) = 0;
// pull the dense part of the parameters and fill it block by block into the
// local network parameters
// start and num are used to pull only part of the parameters
// the keys and values buffers must not be reused before the future completes
// the client unpacks values by block and hands them to multiple senders
// each sender aggregates requests for the same block and accumulates them
// into fill buffers
// the server extracts and returns the configured dimension of each parameter
// block
// the returned data is unpacked and filled into the accumulated buffers
virtual std::future<int32_t> pull_dense(Region *regions, size_t region_num,
size_t table_id) = 0;  // reserved
// push dense params to the parameter server first; this is necessary because
// on cold start the dense weights are initialized on the trainer
virtual std::future<int32_t> push_dense_param(const Region *regions,
size_t region_num,
size_t table_id) = 0;
// issue a pull request with keys and fill the results into values
// keys and values both contain num entries; each value occupies select_size
// bytes
// the keys and values buffers must not be reused before the future completes
// keys requested from multiple threads are merged, then grouped and scattered
// to the servers
// when the results come back, the buffers are traversed and values are filled
virtual std::future<int32_t> pull_sparse(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num) = 0;
virtual std::future<int32_t> print_table_stat(uint32_t table_id) = 0;
// make sure every batched pending request has actually been sent out
virtual std::future<int32_t> flush() = 0;
// shut the server down gracefully
virtual std::future<int32_t> stop_server() = 0;
// server profiler
virtual std::future<int32_t> start_profiler() = 0;
virtual std::future<int32_t> stop_profiler() = 0;
virtual std::future<int32_t> barrier(size_t table_id,
uint32_t barrier_type) = 0;
virtual std::future<int32_t> pull_geo_param(size_t table_id,
std::vector<float> *values,
std::vector<uint64_t> *keys,
int pserver_idx) = 0;
virtual void finalize_worker() = 0;
// client-to-client message sending
virtual std::future<int32_t> send_client2client_msg(int msg_type,
int to_client_id,
const std::string &msg) {
LOG(FATAL) << "Did not implement";
std::promise<int32_t> promise;
std::future<int> fut = promise.get_future();
promise.set_value(-1);
return fut;
}
// client2client message handling:
// std::function<int32_t(int msg_type, int from_client_id, const std::string &msg)>
typedef std::function<int32_t(int, int, const std::string &)> MsgHandlerFunc;
virtual int registe_client2client_msg_handler(int msg_type,
MsgHandlerFunc handler) {
_msg_handler_map[msg_type] = handler;
return 0;
}
virtual int handle_client2client_msg(int msg_type, int from_client_id,
const std::string &msg) {
auto itr = _msg_handler_map.find(msg_type);
if (itr == _msg_handler_map.end()) {
LOG(WARNING) << "unknown client2client_msg type:" << msg_type;
return -1;
}
return itr->second(msg_type, from_client_id, msg);
}
virtual ValueAccessor *table_accessor(size_t table_id) {
auto itr = _table_accessors.find(table_id);
if (itr == _table_accessors.end()) {
return NULL;
}
return itr->second.get();
}
virtual size_t get_server_nums() = 0;
virtual std::future<int32_t> push_dense_raw_gradient(
int table_id, float *total_send_data, size_t total_send_data_size,
void *done) = 0;
virtual std::future<int32_t> push_sparse_raw_gradient(
size_t table_id, const uint64_t *keys, const float **update_values,
size_t num, void *done) = 0;
virtual std::future<int32_t> push_sparse_raw_gradient_partial(
size_t table_id, const uint64_t *keys, const float **update_values,
uint32_t num, void *done, int pserver_idx) = 0;
virtual std::future<int32_t> push_sparse_param(size_t table_id,
const uint64_t *keys,
const float **update_values,
size_t num, void *done) = 0;
protected:
virtual int32_t initialize() = 0;
size_t _client_id;
PSParameter _config;
std::map<uint64_t, std::vector<paddle::distributed::Region>>
_dense_pull_regions;
PSEnvironment *_env;
std::unordered_map<uint32_t, std::shared_ptr<ValueAccessor>> _table_accessors;
std::unordered_map<int32_t, MsgHandlerFunc>
_msg_handler_map;  // handlers for client2client messages
};
REGISTER_REGISTERER(PSClient);
class PSClientFactory {
public:
static PSClient *create(const PSParameter &config);
};
} // namespace distributed
} // namespace paddle
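A small sketch of the client-to-client hooks declared above, assuming a concrete client (e.g. BrpcPsClient) that implements send_client2client_msg and on which create_client2client_connection has already been called; the message type and payload are placeholders:

// client2client sketch (illustrative)
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/distributed/service/ps_client.h"

void WireClient2Client(paddle::distributed::PSClient* client, int to_client_id) {
  const int kMsgType = 100;  // placeholder, must not collide with existing types
  // handler signature: (msg_type, from_client_id, msg) -> ret
  client->registe_client2client_msg_handler(
      kMsgType, [](int msg_type, int from_client_id, const std::string& msg) {
        LOG(INFO) << "msg_type=" << msg_type << " from=" << from_client_id
                  << " bytes=" << msg.size();
        return 0;
      });
  // the future resolves once the peer has handled the message and the RPC returns
  auto status = client->send_client2client_msg(kMsgType, to_client_id, "ping");
  status.wait();
}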
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package paddle;
option cc_generic_services = true;
option cc_enable_arenas = true;
enum PsCmdID {
PS_PULL_DENSE_TABLE = 0;
PS_PUSH_DENSE_TABLE = 1;
PS_PULL_SPARSE_TABLE = 2;
PS_PUSH_SPARSE_TABLE = 3;
PS_SHRINK_TABLE = 4;
PS_SAVE_ONE_TABLE = 5;
PS_SAVE_ALL_TABLE = 6;
PS_LOAD_ONE_TABLE = 7;
PS_LOAD_ALL_TABLE = 8;
PS_CLEAR_ONE_TABLE = 9;
PS_CLEAR_ALL_TABLE = 10;
PS_PUSH_DENSE_PARAM = 11;
PS_STOP_SERVER = 12;
PS_SAVE_ONE_CACHE_TABLE = 13;
PS_GET_CACHE_THRESHOLD = 14;
PS_CACHE_SHUFFLE = 15;
PS_COPY_TABLE = 16;
PS_COPY_TABLE_BY_FEASIGN = 17;
PS_PULL_SPARSE_TABLE_WITH_DEPENDENCY = 18;
PS_PUSH_SPARSE_TABLE_WITH_DEPENDENCY = 19;
PS_PRINT_TABLE_STAT = 20;
PS_SAVE_ONE_TABLE_PREFIX = 21;
PS_SAVE_ONE_TABLE_WITH_WHITELIST = 22;
PS_LOAD_ONE_TABLE_WITH_WHITELIST = 23;
PS_PULL_GEO_PARAM = 24;
PS_BARRIER = 25;
PS_PUSH_SPARSE_PARAM = 26;
PS_START_PROFILER = 27;
PS_STOP_PROFILER = 28;
}
message PsRequestMessage {
required uint32 cmd_id = 1;
optional uint32 table_id = 2;
repeated bytes params = 3;
optional int32 client_id = 4;
optional bytes data = 5;
};
message PsResponseMessage {
required int32 err_code = 1 [ default = 0 ];
required string err_msg = 2 [ default = "" ];
optional bytes data = 3;
};
enum VarType {
LOD_TENSOR = 0;
SELECTED_ROWS = 1;
}
message VariableMessage {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LodData { repeated int64 lod_data = 1; }
optional string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType
optional VarType type = 2;
// bool persistable is not needed for sending.
// tensor info:
optional Type data_type = 3;
repeated int64 dims = 4;
// lod details:
optional int64 lod_level = 5;
repeated LodData lod = 6;
// selected_rows height, aka. original dim0
optional int64 slr_height = 7;
// tensor data
optional bytes data = 8;
}
// for SendAndRecv RPC method
message MultiVariableMessage {
// message flags
required string message_name = 1;
repeated string send_var_names = 2;
repeated string recv_var_names = 3;
repeated VariableMessage var_messages = 4;
};
service PsService {
rpc service(PsRequestMessage) returns (PsResponseMessage);
rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage);
};
\ No newline at end of file
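The generated C++ API for these messages is plain protobuf; a tiny sketch of filling a PsRequestMessage the way HeterClient::SendCmd does, with placeholder table id and params:

// sketch: building a control-command request from the generated sendrecv.pb.h
#include "paddle/fluid/distributed/service/sendrecv.pb.h"

paddle::PsRequestMessage BuildStopServerRequest(int client_id) {
  paddle::PsRequestMessage request;
  request.set_cmd_id(paddle::PS_STOP_SERVER);  // enum value 12 in PsCmdID
  request.set_table_id(0);                     // placeholder table id
  request.set_client_id(client_id);
  request.add_params("");                      // repeated bytes params
  return request;
}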
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/server.h"
#include "glog/logging.h"
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include "paddle/fluid/distributed/table/table.h"
namespace paddle {
namespace distributed {
REGISTER_CLASS(PSServer, BrpcPsServer);
REGISTER_CLASS(PsBaseService, PsService);
PSServer *PSServerFactory::create(const PSParameter &ps_config) {
const auto &config = ps_config.server_param();
if (!config.has_downpour_server_param()) {
LOG(ERROR) << "miss downpour_server_param in ServerParameter";
return NULL;
}
if (!config.downpour_server_param().has_service_param()) {
LOG(ERROR) << "miss service_param in ServerParameter.downpour_server_param";
return NULL;
}
if (!config.downpour_server_param().service_param().has_server_class()) {
LOG(ERROR) << "miss server_class in "
"ServerParameter.downpour_server_param.service_param";
return NULL;
}
const auto &service_param = config.downpour_server_param().service_param();
PSServer *server = CREATE_CLASS(PSServer, service_param.server_class());
if (server == NULL) {
LOG(ERROR) << "server is not registered, server_name:"
<< service_param.server_class();
return NULL;
}
TableManager::instance().initialize();
return server;
}
int32_t PSServer::configure(const PSParameter &config, PSEnvironment &env,
size_t server_rank) {
_config = config.server_param();
_rank = server_rank;
_environment = &env;
_shuffled_ins =
paddle::framework::MakeChannel<std::pair<uint64_t, std::string>>();
const auto &downpour_param = _config.downpour_server_param();
uint32_t barrier_table = UINT32_MAX;
for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) {
auto *table = CREATE_CLASS(
Table, downpour_param.downpour_table_param(i).table_class());
if (downpour_param.downpour_table_param(i).table_class() ==
"BarrierTable") {
barrier_table = downpour_param.downpour_table_param(i).table_id();
}
table->initialize(downpour_param.downpour_table_param(i),
config.fs_client_param());
_table_map[downpour_param.downpour_table_param(i).table_id()].reset(table);
}
if (barrier_table != UINT32_MAX) {
_table_map[barrier_table]->set_table_map(&_table_map);
}
return initialize();
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <future>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "butil/endpoint.h"
#include "google/protobuf/service.h"
#include "paddle/fluid/distributed/common/registerer.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/service/env.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/framework/channel.h"
namespace paddle {
namespace distributed {
class Table;
class PSServer {
public:
PSServer() {}
virtual ~PSServer() {}
PSServer(PSServer &&) = delete;
PSServer(const PSServer &) = delete;
virtual int32_t configure(const PSParameter &config, PSEnvironment &env,
size_t server_rank) final;
// return server_ip
virtual std::string ip() { return butil::my_ip_cstr(); }
// return server_port
virtual int32_t port() = 0;
virtual uint64_t start(const std::string &ip, uint32_t port) = 0;
virtual int32_t stop() = 0;
inline size_t rank() const { return _rank; }
inline PSEnvironment *environment() { return _environment; }
inline const ServerParameter *config() const { return &_config; }
inline Table *table(size_t table_id) {
auto itr = _table_map.find(table_id);
if (itr != _table_map.end()) {
return itr->second.get();
}
return NULL;
}
inline std::unordered_map<uint32_t, std::shared_ptr<Table>> *table() {
return &_table_map;
}
typedef std::function<int32_t(int, int, const std::string &)> MsgHandlerFunc;
virtual int registe_pserver2pserver_msg_handler(int msg_type,
MsgHandlerFunc handler) {
_msg_handler_map[msg_type] = handler;
return 0;
}
paddle::framework::Channel<std::pair<uint64_t, std::string>> _shuffled_ins;
protected:
virtual int32_t initialize() = 0;
protected:
size_t _rank;
ServerParameter _config;
PSEnvironment *_environment;
std::unordered_map<uint32_t, std::shared_ptr<Table>> _table_map;
std::unordered_map<int32_t, MsgHandlerFunc> _msg_handler_map;
};
REGISTER_REGISTERER(PSServer);
typedef std::function<void(void *)> PServerCallBack;
class PServerClosure : public google::protobuf::Closure {
public:
PServerClosure(PServerCallBack callback) : _callback(callback) {}
virtual ~PServerClosure() {}
virtual void set_promise_value(int value) {
for (auto &promise : _promises) {
promise->set_value(value);
}
}
void add_promise(std::shared_ptr<std::promise<int32_t>> &promise) {
_promises.push_back(promise);
}
protected:
PServerCallBack _callback;
std::vector<std::shared_ptr<std::promise<int32_t>>> _promises;
};
class PsBaseService : public PsService {
public:
PsBaseService() : _rank(0), _server(NULL), _config(NULL) {}
virtual ~PsBaseService() {}
virtual int32_t configure(PSServer *server) {
_server = server;
_rank = _server->rank();
_config = _server->config();
return 0;
}
virtual void service(::google::protobuf::RpcController *controller,
const ::paddle::PsRequestMessage *request,
::paddle::PsResponseMessage *response,
::google::protobuf::Closure *done) override = 0;
virtual void set_response_code(PsResponseMessage &response, int err_code,
const char *err_msg) {
response.set_err_msg(err_msg);
response.set_err_code(err_code);
LOG(WARNING) << "Resonse err_code:" << err_code << " msg:" << err_msg;
}
virtual int32_t initialize() = 0;
protected:
size_t _rank;
PSServer *_server;
const ServerParameter *_config;
};
REGISTER_REGISTERER(PsBaseService);
class PSServerFactory {
public:
static PSServer *create(const PSParameter &config);
};
} // namespace distributed
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/service/service.h"
#include <fcntl.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/text_format.h>
#include <iostream>
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
paddle::distributed::PSParameter load_from_prototxt(
const std::string& filename) {
paddle::distributed::PSParameter param;
int file_descriptor = open(filename.c_str(), O_RDONLY);
if (file_descriptor == -1) {
VLOG(3) << "FATAL: fail to parse " << filename;
exit(-1);
}
google::protobuf::io::FileInputStream fileInput(file_descriptor);
if (!google::protobuf::TextFormat::Parse(&fileInput, &param)) {
VLOG(3) << "FATAL: fail to parse " << filename;
exit(-1);
}
close(file_descriptor);
return param;
}
void PSCore::init_gflag(const std::string& gflags) {
LOG(INFO) << "Init With Gflags:" << gflags;
std::vector<std::string> flags = paddle::string::split_string(gflags);
if (flags.size() < 1) {
flags.push_back("-max_body_size=314217728");
flags.push_back("-bthread_concurrency=40");
flags.push_back("-socket_max_unwritten_bytes=2048000000");
flags.push_back("-max_connection_pool_size=1950");
}
auto it = flags.begin();
flags.insert(it, "exe default");
std::vector<char*> flags_ptr(flags.size());
for (size_t i = 0; i < flags.size(); ++i) {
flags_ptr[i] = const_cast<char*>(flags[i].c_str());
}
int params_cnt = static_cast<int>(flags.size());
char** params_ptr = flags_ptr.data();
::google::ParseCommandLineFlags(&params_cnt, &params_ptr, true);
}
int PSCore::init_server(const std::string& dist_desc,
const std::vector<std::string>* host_sign_list,
int node_num, int index) {
google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param);
init_gflag(_ps_param.init_gflags());
_ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(host_sign_list, node_num);
int ret = 0;
_server_ptr = std::shared_ptr<paddle::distributed::PSServer>(
paddle::distributed::PSServerFactory::create(_ps_param));
ret = _server_ptr->configure(_ps_param, _ps_env, index);
CHECK(ret == 0) << "failed to configure server";
return ret;
}
int PSCore::init_worker(
const std::string& dist_desc,
const std::map<uint64_t, std::vector<paddle::distributed::Region>>& regions,
const std::vector<std::string>* host_sign_list, int node_num, int index) {
google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param);
init_gflag(_ps_param.init_gflags());
_ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(host_sign_list, node_num);
int ret = 0;
VLOG(1) << "PSCore::init_worker";
auto* communicator = Communicator::GetInstance();
ret = communicator->GetPsClient()->configure(_ps_param, regions, _ps_env,
index);
communicator->Start();
return ret;
}
std::vector<uint64_t> PSCore::get_client_info() {
return _ps_env.get_client_info();
}
int PSCore::create_client2client_connection(int pserver_timeout_ms,
int pserver_connect_timeout_ms,
int max_retry) {
int ret = _worker_ptr->create_client2client_connection(
pserver_timeout_ms, pserver_connect_timeout_ms, max_retry);
return ret;
}
uint64_t PSCore::run_server(const std::string& ip, uint32_t port) {
return _server_ptr->start(ip, port);
}
int PSCore::finalize_worker() {
_worker_ptr->finalize_worker();
return 0;
}
int PSCore::stop_server() {
auto stop_status = _worker_ptr->stop_server();
stop_status.wait();
return 0;
}
paddle::distributed::PSParameter* PSCore::get_param() { return &_ps_param; }
} // namespace distributed
} // namespace paddle
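Tying PSCore together on the server side, a minimal sketch built only from init_server and run_server above; dist_desc is a text-format PSParameter, host_sign_list holds serialized PSHost strings (as in the brpc tests below), and the ip/port are placeholders:

// PSCore server-side lifecycle sketch (illustrative)
#include <cstdint>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/service/service.h"

uint64_t RunPsCoreServer(const std::string& dist_desc,
                         const std::vector<std::string>& host_sign_list,
                         int index) {
  paddle::distributed::PSCore core;
  core.init_server(dist_desc, &host_sign_list,
                   static_cast<int>(host_sign_list.size()), index);
  // start() inside run_server blocks until a worker issues stop_server()
  return core.run_server("127.0.0.1", 8200);  // placeholder endpoint
}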
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <glog/logging.h>
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/service/ps_client.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/service/server.h"
namespace paddle {
namespace distributed {
class PSCore {
public:
explicit PSCore() {}
virtual ~PSCore() {}
virtual int init_server(const std::string& dist_desc,
const std::vector<std::string>* host_sign_list,
int node_num, int index);
virtual int init_worker(
const std::string& dist_desc,
const std::map<uint64_t, std::vector<paddle::distributed::Region>>&
regions,
const std::vector<std::string>* host_sign_list, int node_num, int index);
virtual uint64_t run_server(const std::string& ip, uint32_t port);
virtual int stop_server();
virtual int finalize_worker();
virtual std::vector<uint64_t> get_client_info();
virtual int create_client2client_connection(int pserver_timeout_ms,
int pserver_connect_timeout_ms,
int max_retry);
std::shared_ptr<paddle::distributed::PSServer>
_server_ptr; // pointer to server
std::shared_ptr<paddle::distributed::PSClient>
_worker_ptr; // pointer to worker
virtual paddle::distributed::PSParameter* get_param();
private:
void init_gflag(const std::string& gflags);
paddle::distributed::PSParameter _ps_param;
paddle::distributed::PaddlePSEnvironment _ps_env;
};
} // namespace distributed
} // namespace paddle
......@@ -16,3 +16,16 @@ cc_test(geo_table_test SRCS geo_table_test.cc DEPS common_table table tensor_acc
set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
# open it until CI support brpc
return()
set_source_files_properties(brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(brpc_service_dense_sgd_test SRCS brpc_service_dense_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(brpc_service_sparse_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS})
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <condition_variable> // NOLINT
#include <string>
#include <thread> // NOLINT
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include "paddle/fluid/distributed/service/env.h"
#include "paddle/fluid/distributed/service/ps_client.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/service/service.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace math = paddle::operators::math;
namespace memory = paddle::memory;
namespace distributed = paddle::distributed;
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto x_var = scope->Var("x");
x_var->GetMutable<framework::LoDTensor>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
float* x_ptr =
x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0 * (float)i;
}
void GetDownpourDenseTableProto(
::paddle::distributed::TableParameter* dense_table_proto) {
dense_table_proto->set_table_id(0);
dense_table_proto->set_table_class("CommonDenseTable");
dense_table_proto->set_shard_num(256);
dense_table_proto->set_type(::paddle::distributed::PS_DENSE_TABLE);
::paddle::distributed::TableAccessorParameter* accessor_proto =
dense_table_proto->mutable_accessor();
::paddle::distributed::CommonAccessorParameter* common_proto =
dense_table_proto->mutable_common();
accessor_proto->set_accessor_class("CommMergeAccessor");
accessor_proto->set_fea_dim(100);
accessor_proto->set_embedx_dim(1);
common_proto->set_name("sgd");
common_proto->set_table_name("MergedDense");
common_proto->set_trainer_num(1);
common_proto->set_sync(false);
common_proto->add_params("Param");
common_proto->add_dims(100);
common_proto->add_initializers("fill_constant&1.0");
common_proto->add_params("LearningRate");
common_proto->add_dims(1);
common_proto->add_initializers("fill_constant&1.0");
}
::paddle::distributed::PSParameter GetServerProto() {
// Generate server proto desc
::paddle::distributed::PSParameter server_fleet_desc;
::paddle::distributed::ServerParameter* server_proto =
server_fleet_desc.mutable_server_param();
::paddle::distributed::DownpourServerParameter* downpour_server_proto =
server_proto->mutable_downpour_server_param();
::paddle::distributed::ServerServiceParameter* server_service_proto =
downpour_server_proto->mutable_service_param();
server_service_proto->set_service_class("PsService");
server_service_proto->set_server_class("BrpcPsServer");
server_service_proto->set_client_class("BrpcPsClient");
server_service_proto->set_start_server_port(0);
server_service_proto->set_server_thread_num(12);
::paddle::distributed::TableParameter* dense_table_proto =
downpour_server_proto->add_downpour_table_param();
GetDownpourDenseTableProto(dense_table_proto);
return server_fleet_desc;
}
::paddle::distributed::PSParameter GetWorkerProto() {
::paddle::distributed::PSParameter worker_fleet_desc;
::paddle::distributed::WorkerParameter* worker_proto =
worker_fleet_desc.mutable_worker_param();
::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto =
worker_proto->mutable_downpour_worker_param();
::paddle::distributed::TableParameter* worker_dense_table_proto =
downpour_worker_proto->add_downpour_table_param();
GetDownpourDenseTableProto(worker_dense_table_proto);
::paddle::distributed::ServerParameter* server_proto =
worker_fleet_desc.mutable_server_param();
::paddle::distributed::DownpourServerParameter* downpour_server_proto =
server_proto->mutable_downpour_server_param();
::paddle::distributed::ServerServiceParameter* server_service_proto =
downpour_server_proto->mutable_service_param();
server_service_proto->set_service_class("PsService");
server_service_proto->set_server_class("BrpcPsServer");
server_service_proto->set_client_class("BrpcPsClient");
server_service_proto->set_start_server_port(0);
server_service_proto->set_server_thread_num(12);
::paddle::distributed::TableParameter* server_dense_table_proto =
downpour_server_proto->add_downpour_table_param();
GetDownpourDenseTableProto(server_dense_table_proto);
return worker_fleet_desc;
}
/*-------------------------------------------------------------------------*/
std::string ip_ = "127.0.0.1";
uint32_t port_ = 4214;
std::vector<std::string> host_sign_list_;
std::shared_ptr<paddle::distributed::PSServer> pserver_ptr_;
std::shared_ptr<paddle::distributed::PSClient> worker_ptr_;
void RunServer() {
::paddle::distributed::PSParameter server_proto = GetServerProto();
auto _ps_env = paddle::distributed::PaddlePSEnvironment();
LOG(INFO) << "RUN set_ps_servers";
_ps_env.set_ps_servers(&host_sign_list_, 1);
pserver_ptr_ = std::shared_ptr<paddle::distributed::PSServer>(
paddle::distributed::PSServerFactory::create(server_proto));
LOG(INFO) << "RUN configure";
pserver_ptr_->configure(server_proto, _ps_env, 0);
LOG(INFO) << "RUN start";
pserver_ptr_->start(ip_, port_);
LOG(INFO) << "End start";
}
void RunClient(std::map<uint64_t, std::vector<paddle::distributed::Region>>&
dense_regions) {
::paddle::distributed::PSParameter worker_proto = GetWorkerProto();
paddle::distributed::PaddlePSEnvironment _ps_env;
auto servers_ = host_sign_list_.size();
_ps_env = paddle::distributed::PaddlePSEnvironment();
LOG(INFO) << "Run set_ps_servers";
_ps_env.set_ps_servers(&host_sign_list_, servers_);
LOG(INFO) << "Run Create PSClient";
worker_ptr_ = std::shared_ptr<paddle::distributed::PSClient>(
paddle::distributed::PSClientFactory::create(worker_proto));
LOG(INFO) << "Run configure";
worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0);
}
void RunBrpcPushDense() {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
auto ph_host = paddle::distributed::PSHost(ip_, port_, 0);
host_sign_list_.push_back(ph_host.serialize_to_string());
// Start Server
std::thread server_thread(RunServer);
sleep(1);
// Start Client
LOG(INFO) << "Run InitTensorsOnClient";
framework::Scope client_scope;
platform::CPUPlace place;
InitTensorsOnClient(&client_scope, &place, 100);
std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions;
dense_regions.insert(
std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {}));
auto regions = dense_regions[0];
framework::Variable* var = client_scope.FindVar("x");
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
float* w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
regions.emplace_back(std::move(reg));
LOG(INFO) << "Run RunClient";
RunClient(dense_regions);
/*-----------------------Test Server Init----------------------------------*/
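// the dense table initializes Param with "fill_constant&1.0" (see
// GetDownpourDenseTableProto above), so this first pull is expected to return
// 1.0 for every element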
LOG(INFO) << "Run pull_dense_param";
float* temp = new float[tensor->numel()]();
std::vector<paddle::distributed::Region> temp_region;
paddle::distributed::Region temp_reg(temp, tensor->numel());
temp_region.emplace_back(std::move(temp_reg));
auto pull_status =
worker_ptr_->pull_dense(temp_region.data(), temp_region.size(), 0);
pull_status.wait();
for (size_t idx = 0; idx < tensor->numel(); ++idx) {
EXPECT_FLOAT_EQ(temp[idx], 1.0);
}
/*-----------------------Test Push Param----------------------------------*/
LOG(INFO) << "Run push_dense_param";
auto push_status =
worker_ptr_->push_dense_param(regions.data(), regions.size(), 0);
push_status.wait();
pull_status = worker_ptr_->pull_dense(regions.data(), regions.size(), 0);
pull_status.wait();
for (size_t idx = 0; idx < tensor->numel(); ++idx) {
EXPECT_FLOAT_EQ(w[idx], float(idx));
}
/*-----------------------Test Push Grad----------------------------------*/
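// server-side dense SGD applies param -= LearningRate * grad; LearningRate was
// initialized to 1.0 and the pushed gradient (temp) is still all 1.0 from the
// first pull, so each w[idx] is expected to drop from idx to idx - 1.0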
paddle::distributed::DownpourBrpcClosure* closure =
new paddle::distributed::DownpourBrpcClosure(1, [&](void* done) {
int ret = 0;
auto* closure = (paddle::distributed::DownpourBrpcClosure*)done;
for (size_t i = 0; i < 1; ++i) {
if (closure->check_response(i, paddle::PS_PUSH_DENSE_TABLE) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
});
LOG(INFO) << "Run pull_dense_grad";
auto push_grad_status =
worker_ptr_->push_dense_raw_gradient(0, temp, tensor->numel(), closure);
push_grad_status.wait();
auto pull_update_status =
worker_ptr_->pull_dense(regions.data(), regions.size(), 0);
pull_update_status.wait();
for (size_t idx = 0; idx < tensor->numel(); ++idx) {
EXPECT_FLOAT_EQ(w[idx], float(idx) - 1.0);
}
LOG(INFO) << "Run stop_server";
worker_ptr_->stop_server();
LOG(INFO) << "Run finalize_worker";
worker_ptr_->finalize_worker();
server_thread.join();
}
TEST(RunBrpcPushDense, Run) { RunBrpcPushDense(); }
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <condition_variable> // NOLINT
#include <string>
#include <thread> // NOLINT
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include "paddle/fluid/distributed/service/env.h"
#include "paddle/fluid/distributed/service/ps_client.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/service/service.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace math = paddle::operators::math;
namespace memory = paddle::memory;
namespace distributed = paddle::distributed;
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto x_var = scope->Var("x");
x_var->GetMutable<framework::LoDTensor>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
float* x_ptr =
x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
}
void GetDownpourSparseTableProto(
::paddle::distributed::TableParameter* sparse_table_proto) {
sparse_table_proto->set_table_id(0);
sparse_table_proto->set_table_class("CommonSparseTable");
sparse_table_proto->set_shard_num(256);
sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE);
::paddle::distributed::TableAccessorParameter* accessor_proto =
sparse_table_proto->mutable_accessor();
::paddle::distributed::CommonAccessorParameter* common_proto =
sparse_table_proto->mutable_common();
accessor_proto->set_accessor_class("CommMergeAccessor");
accessor_proto->set_fea_dim(0);
accessor_proto->set_embedx_dim(10);
common_proto->set_name("sgd");
common_proto->set_table_name("MergedDense");
common_proto->set_trainer_num(1);
common_proto->set_sync(false);
common_proto->add_params("Param");
common_proto->add_dims(10);
common_proto->add_initializers("uniform_random&0&-1.0&1.0");
common_proto->add_params("LearningRate");
common_proto->add_dims(1);
common_proto->add_initializers("fill_constant&1.0");
}
::paddle::distributed::PSParameter GetServerProto() {
// Generate server proto desc
::paddle::distributed::PSParameter server_fleet_desc;
::paddle::distributed::ServerParameter* server_proto =
server_fleet_desc.mutable_server_param();
::paddle::distributed::DownpourServerParameter* downpour_server_proto =
server_proto->mutable_downpour_server_param();
::paddle::distributed::ServerServiceParameter* server_service_proto =
downpour_server_proto->mutable_service_param();
server_service_proto->set_service_class("PsService");
server_service_proto->set_server_class("BrpcPsServer");
server_service_proto->set_client_class("BrpcPsClient");
server_service_proto->set_start_server_port(0);
server_service_proto->set_server_thread_num(12);
::paddle::distributed::TableParameter* sparse_table_proto =
downpour_server_proto->add_downpour_table_param();
GetDownpourSparseTableProto(sparse_table_proto);
return server_fleet_desc;
}
::paddle::distributed::PSParameter GetWorkerProto() {
::paddle::distributed::PSParameter worker_fleet_desc;
::paddle::distributed::WorkerParameter* worker_proto =
worker_fleet_desc.mutable_worker_param();
::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto =
worker_proto->mutable_downpour_worker_param();
::paddle::distributed::TableParameter* worker_sparse_table_proto =
downpour_worker_proto->add_downpour_table_param();
GetDownpourSparseTableProto(worker_sparse_table_proto);
::paddle::distributed::ServerParameter* server_proto =
worker_fleet_desc.mutable_server_param();
::paddle::distributed::DownpourServerParameter* downpour_server_proto =
server_proto->mutable_downpour_server_param();
::paddle::distributed::ServerServiceParameter* server_service_proto =
downpour_server_proto->mutable_service_param();
server_service_proto->set_service_class("PsService");
server_service_proto->set_server_class("BrpcPsServer");
server_service_proto->set_client_class("BrpcPsClient");
server_service_proto->set_start_server_port(0);
server_service_proto->set_server_thread_num(12);
::paddle::distributed::TableParameter* server_sparse_table_proto =
downpour_server_proto->add_downpour_table_param();
GetDownpourSparseTableProto(server_sparse_table_proto);
return worker_fleet_desc;
}
/*-------------------------------------------------------------------------*/
std::string ip_ = "127.0.0.1";
uint32_t port_ = 4209;
std::vector<std::string> host_sign_list_;
std::shared_ptr<paddle::distributed::PSServer> pserver_ptr_;
std::shared_ptr<paddle::distributed::PSClient> worker_ptr_;
void RunServer() {
::paddle::distributed::PSParameter server_proto = GetServerProto();
auto _ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(&host_sign_list_, 1);
pserver_ptr_ = std::shared_ptr<paddle::distributed::PSServer>(
paddle::distributed::PSServerFactory::create(server_proto));
pserver_ptr_->configure(server_proto, _ps_env, 0);
pserver_ptr_->start(ip_, port_);
}
void RunClient(std::map<uint64_t, std::vector<paddle::distributed::Region>>&
dense_regions) {
::paddle::distributed::PSParameter worker_proto = GetWorkerProto();
paddle::distributed::PaddlePSEnvironment _ps_env;
auto servers_ = host_sign_list_.size();
_ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(&host_sign_list_, servers_);
worker_ptr_ = std::shared_ptr<paddle::distributed::PSClient>(
paddle::distributed::PSClientFactory::create(worker_proto));
worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0);
}
void RunBrpcPushSparse() {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
auto ph_host = paddle::distributed::PSHost(ip_, port_, 0);
host_sign_list_.push_back(ph_host.serialize_to_string());
// Start Server
std::thread server_thread(RunServer);
sleep(1);
// Start Client
framework::Scope client_scope;
platform::CPUPlace place;
InitTensorsOnClient(&client_scope, &place, 100);
std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions;
dense_regions.insert(
std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {}));
auto regions = dense_regions[0];
framework::Variable* var = client_scope.FindVar("x");
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
RunClient(dense_regions);
std::vector<uint64_t> fea_keys(10);
std::vector<float> fea_values(100);
std::vector<float> fea_temp_values(100);
std::vector<float*> fea_value_ptr(10);
std::vector<float*> fea_temp_value_ptr(10);
for (size_t idx = 0; idx < fea_keys.size(); ++idx) {
fea_keys[idx] = (uint64_t)idx;
fea_value_ptr[idx] = fea_values.data() + idx * 10;
fea_temp_value_ptr[idx] = fea_temp_values.data() + idx * 10;
}
/*-----------------------Test Server Init----------------------------------*/
LOG(INFO) << "Run pull_sparse_param";
auto pull_status = worker_ptr_->pull_sparse(fea_value_ptr.data(), 0,
fea_keys.data(), fea_keys.size());
pull_status.wait();
for (size_t idx = 0; idx < tensor->numel(); ++idx) {
fea_values.data()[idx] *= 2.0;
}
/*-----------------------Test Push Param----------------------------------*/
LOG(INFO) << "Run push_sparse_param";
paddle::distributed::DownpourBrpcClosure* closure_push_param =
new paddle::distributed::DownpourBrpcClosure(1, [&](void* done) {
int ret = 0;
auto* closure = (paddle::distributed::DownpourBrpcClosure*)done;
for (size_t i = 0; i < 1; ++i) {
if (closure->check_response(i, paddle::PS_PUSH_SPARSE_PARAM) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
});
auto push_status = worker_ptr_->push_sparse_param(
0, fea_keys.data(), (const float**)fea_value_ptr.data(), fea_keys.size(),
closure_push_param);
push_status.wait();
auto pull_param_status = worker_ptr_->pull_sparse(
fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size());
pull_param_status.wait();
for (size_t idx = 0; idx < tensor->numel(); ++idx) {
EXPECT_FLOAT_EQ(fea_temp_values[idx], fea_values[idx]);
}
/*-----------------------Test Push Grad----------------------------------*/
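// sparse SGD applies value -= LearningRate * grad with LearningRate = 1.0; the
// pushed gradients come from tensor "x", which was initialized to all 1.0, so
// every pulled value is expected to drop by exactly 1.0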
paddle::distributed::DownpourBrpcClosure* closure_push_grad =
new paddle::distributed::DownpourBrpcClosure(1, [&](void* done) {
int ret = 0;
auto* closure = (paddle::distributed::DownpourBrpcClosure*)done;
for (size_t i = 0; i < 1; ++i) {
if (closure->check_response(i, paddle::PS_PUSH_SPARSE_TABLE) != 0) {
ret = -1;
break;
}
}
closure->set_promise_value(ret);
});
LOG(INFO) << "Run pull_sparse_grad";
std::vector<float*> push_g_vec;
for (auto i = 0; i < static_cast<int>(fea_keys.size()); ++i) {
push_g_vec.push_back(tensor->data<float>() + i * 10);
}
auto push_grad_status = worker_ptr_->push_sparse_raw_gradient(
0, fea_keys.data(), (const float**)push_g_vec.data(), fea_keys.size(),
closure_push_grad);
push_grad_status.wait();
auto pull_update_status = worker_ptr_->pull_sparse(
fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size());
pull_update_status.wait();
for (size_t idx = 0; idx < tensor->numel(); ++idx) {
EXPECT_FLOAT_EQ(fea_temp_values[idx], fea_values[idx] - 1.0);
}
LOG(INFO) << "Run stop_server";
worker_ptr_->stop_server();
LOG(INFO) << "Run finalize_worker";
worker_ptr_->finalize_worker();
server_thread.join();
}
TEST(RunBrpcPushSparse, Run) { RunBrpcPushSparse(); }
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/distributed/service/heter_serde.h"
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
......
......@@ -109,7 +109,7 @@ TEST(SparseGeoTable, SSUM) {
auto id = geo_pull_ids[i][j];
for (int k = 0; k < emb_dim; k++) {
ASSERT_TRUE(abs(geo_pull_values[i][j * emb_dim + k] -
pull_values[id * emb_dim + k]) < 1e-6);
pull_values[id * emb_dim + k]) < 1e-5);
}
}
}
......
......@@ -103,7 +103,7 @@ TEST(CommonSparseTable, SGD) {
table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size());
for (size_t i = 0; i < init_values.size(); ++i) {
auto update_val = init_values[i] - 1.0 * total_gradients[i];
ASSERT_TRUE(abs(update_val - pull_values[i]) < 1e-6);
ASSERT_TRUE(abs(update_val - pull_values[i]) < 1e-5);
}
}
......