[Feature] one ps (3/4) (#29604)

* oneps (3/4) Co-authored-by: N MrChengmo <cmchengmo@163.com> Co-authored-by: N malin10 <malin10@baidu.com> Co-authored-by: N chengmo <chengmo@baidu.com>

[Feature] one ps (3/4) (#29604)
* oneps (3/4) Co-authored-by: N MrChengmo <cmchengmo@163.com> Co-authored-by: N malin10 <malin10@baidu.com> Co-authored-by: N chengmo <chengmo@baidu.com>
032414ca · tangwei12 · GitHub · edc06c6a · 032414ca · 032414ca
121 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -246,17 +246,6 @@ endif()

 include(third_party)  # download, build, install third_party, Contains about 20+ dependencies

-if(WITH_DISTRIBUTE)
-    if(WITH_GRPC)
-        message(STATUS "Use grpc framework.")
-        include(external/grpc)
-    else()
-        message(STATUS "Use brpc framework.")
-        include(external/leveldb)
-        include(external/brpc)
-    endif()
-endif()
-
 include(flags)              # set paddle compile flags

 if(WITH_PROFILER)

--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -33,15 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})

 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")

 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
        extern_brpc
        ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY  "${GIT_URL}/apache/incubator-brpc.git"
-    GIT_TAG         "ad00fe940b4f05225b214131959293bbed8744a0" #rdma branch's head now.
+        # TODO(gongwb): change to de newst repo when they changed.
+        GIT_REPOSITORY  "https://github.com/wangjiawei04/brpc"
+        GIT_TAG         "6d79e0b17f25107c35b705ea58d888083f59ff47"
        PREFIX          ${BRPC_SOURCES_DIR}
        UPDATE_COMMAND  ""
        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -63,9 +63,13 @@ ExternalProject_Add(
        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest)
+# ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
+ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog snappy)
 ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)

 add_definitions(-DBRPC_WITH_GLOG)
+
+LIST(APPEND external_project_dependencies brpc)
+
--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
@@ -23,10 +23,10 @@ INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
 ExternalProject_Add(
        extern_leveldb
        ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
        PREFIX ${LEVELDB_SOURCES_DIR}
-    GIT_REPOSITORY "${GIT_URL}/google/leveldb.git"
+        GIT_REPOSITORY "https://github.com/google/leveldb"
        GIT_TAG v1.18
+        UPDATE_COMMAND ""
        CONFIGURE_COMMAND ""
        BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
        INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/
@@ -35,6 +35,11 @@ ExternalProject_Add(
        BUILD_IN_SOURCE 1
 )

+ADD_DEPENDENCIES(extern_leveldb snappy)
+
 ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
 ADD_DEPENDENCIES(leveldb extern_leveldb)
+
+LIST(APPEND external_project_dependencies leveldb)
+
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include (ExternalProject)
+
+# NOTE: snappy is needed when linking with recordio
+
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+
+if(WIN32)
+    SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
+else()
+    SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+endif()
+
+ExternalProject_Add(
+        extern_snappy
+        GIT_REPOSITORY "https://github.com/google/snappy"
+        GIT_TAG "1.1.7"
+        PREFIX          ${SNAPPY_SOURCES_DIR}
+        UPDATE_COMMAND  ""
+        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+        -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
+        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+        -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+        -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+        -DBUILD_TESTING=OFF
+        -DSNAPPY_BUILD_TESTS:BOOL=OFF
+        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+        ${EXTERNAL_OPTIONAL_ARGS}
+        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
+        -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+)
+IF(WIN32)
+    IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
+        add_custom_command(TARGET extern_snappy POST_BUILD
+                COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib
+                )
+    ENDIF()
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
+else(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+endif (WIN32)
+
+add_library(snappy STATIC IMPORTED GLOBAL)
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
+
+include_directories(${SNAPPY_INCLUDE_DIR})
+add_dependencies(snappy extern_snappy)
+
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -95,7 +95,7 @@ include_directories("${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io")
 if(NOT APPLE)
  find_package(Threads REQUIRED)
  link_libraries(${CMAKE_THREAD_LIBS_INIT})
-  if(WITH_PSLIB)
+  if(WITH_PSLIB OR WITH_DISTRIBUTE)
    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl")
  else()
    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")

--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -233,7 +233,7 @@ if(WITH_PYTHON)
    list(APPEND third_party_deps extern_pybind)
 endif()

-IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
+IF(WITH_TESTING OR WITH_DISTRIBUTE)
    include(external/gtest)     # download, build, install gtest
    list(APPEND third_party_deps extern_gtest)
 ENDIF()
@@ -275,14 +275,18 @@ if(WITH_BOX_PS)
    list(APPEND third_party_deps extern_box_ps)
 endif(WITH_BOX_PS)

-if(WITH_DISTRIBUTE)
+if (WITH_DISTRIBUTE)
+    include(external/snappy)
+    list(APPEND third_party_deps extern_snappy)

-    if(WITH_GRPC)
-        list(APPEND third_party_deps extern_grpc)
-    else()
+    include(external/leveldb)
    list(APPEND third_party_deps extern_leveldb)
+        
+    include(external/brpc)
    list(APPEND third_party_deps extern_brpc)
-    endif()
+
+    include(external/libmct)     # download, build, install libmct
+    list(APPEND third_party_deps extern_libmct)
 endif()

 if(WITH_XBYAK)

--- a/paddle/fluid/distributed/CMakeLists.txt
+++ b/paddle/fluid/distributed/CMakeLists.txt
@@ -14,14 +14,9 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
            "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
 endif()

-
 add_subdirectory(table)
-add_subdirectory(test)
-
-# open it until CI support brpc
-return()
-
 add_subdirectory(service)
+add_subdirectory(test)

 get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)


--- a/paddle/fluid/distributed/service/CMakeLists.txt
+++ b/paddle/fluid/distributed/service/CMakeLists.txt
@@ -35,6 +35,6 @@ cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS})
 cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS})
 cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RPC_DEPS})

-cc_library(brpc_utils SRCS brpc_utils.cc DEPS ${COMMON_DEPS} ${RPC_DEPS})
+cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS})
 cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
 cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
--- a/paddle/fluid/distributed/service/brpc_ps_client.cc
+++ b/paddle/fluid/distributed/service/brpc_ps_client.cc
@@ -741,7 +741,7 @@ std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
      request_call_num, [shard_sorted_kvs, value_size](void *done) {
        int ret = 0;
        auto *closure = (DownpourBrpcClosure *)done;
-        for (size_t i = 0; i < ids.size(); ++i) {
+        for (size_t i = 0; i < shard_sorted_kvs->size(); ++i) {
          if (closure->check_response(i, PS_PULL_SPARSE_TABLE) != 0) {
            ret = -1;
            break;

--- a/paddle/fluid/distributed/service/communicator.cc
+++ b/paddle/fluid/distributed/service/communicator.cc
@@ -839,7 +839,7 @@ void GeoCommunicator::InitParams(const RecvCtxMap &recv_varname_to_ctx) {

  for (auto &iter : send_varname_to_ctx_) {
    auto &ctx = iter.second;
-    if (!ctx.is_sparse) return;
+    if (!ctx.is_sparse) continue;
    auto &varname = ctx.origin_varnames[0];
    auto &table_id = ctx.table_id;
    auto param = varname.substr(0, varname.size() - 5);
@@ -853,12 +853,12 @@ void GeoCommunicator::InitDense(std::vector<std::string> &varnames,
  if (trainer_id_ == 0) {
    RpcSendDenseParam(varnames, table_id, *recv_scope_);
    BarrierWithTable(1);
-    VLOG(0) << "push dense param to table " << table_id
+    VLOG(1) << "push dense param to table " << table_id
            << " from 0' trainer done";
  } else {
    BarrierWithTable(1);
    RpcRecvDense(varnames, table_id, recv_scope_);
-    VLOG(0) << "push dense param to table " << table_id
+    VLOG(1) << "pull dense param to table " << table_id
            << " from 0' trainer done";
  }

@@ -952,20 +952,20 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
 }

 void GeoCommunicator::InitSparse(const std::string &var_name, int table_id) {
-  VLOG(0) << "Init Sparse " << var_name << " : table " << table_id << " begin.";
+  VLOG(1) << "Init Sparse " << var_name << " : table " << table_id << " begin.";
  if (trainer_id_ == 0) {
    RpcSendSparseParam(var_name, table_id, *recv_scope_);
    BarrierWithTable(1);
-    VLOG(0) << "push sparse param to table " << table_id
+    VLOG(1) << "push sparse param to table " << table_id
            << " from 0' trainer done";
  } else {
    BarrierWithTable(1);
    RpcRecvSparse(var_name, table_id, recv_scope_);
-    VLOG(0) << "push dense param to table " << table_id
+    VLOG(1) << "pull sparse param to table " << table_id
            << " from 0' trainer done";
  }

-  VLOG(0) << "Init Sparse " << var_name << " : table " << table_id << " done.";
+  VLOG(1) << "Init Sparse " << var_name << " : table " << table_id << " done.";
  auto *global_var = recv_scope_->FindVar(var_name);
  auto *var = old_scope_->Var(var_name);
  framework::CopyVariable(*global_var, var);

--- a/paddle/fluid/distributed/service/heter_client.cc
+++ b/paddle/fluid/distributed/service/heter_client.cc
@@ -24,11 +24,11 @@
 #include "paddle/fluid/platform/timer.h"

 DECLARE_int32(rpc_deadline);
+DECLARE_int32(pserver_timeout_ms);
+
 namespace paddle {
 namespace distributed {

-DEFINE_int32(pserver_timeout_ms, 10800000, "pserver request server timeout_ms");
-
 std::shared_ptr<HeterClient> HeterClient::s_instance_ = NULL;
 bool HeterClient::is_initialized_ = false;

@@ -53,6 +53,23 @@ void HeterClient::Stop() {
  }
 }

+void HeterClient::FinalizeWorker() {
+  running_ = false;
+  if (!is_initialized_) {
+    VLOG(0) << "HeterClient is not inited, do nothing";
+  } else {
+    if (main_thread_) {
+      main_thread_->join();
+      main_thread_.reset(nullptr);
+    }
+    VLOG(1) << "HeterClient Stop Done";
+  }
+}
+
+std::future<int32_t> HeterClient::StopHeterWorker() {
+  return SendCmd(-1, PS_STOP_SERVER, {});
+}
+
 void HeterClient::RpcProfilerControl() {
  if (trainer_id_ == 0) {
    if (!do_server_profiler_ && platform::IsProfileEnabled()) {
@@ -73,7 +90,7 @@ void HeterClient::CreateClient2XpuConnection() {
  brpc::ChannelOptions options;
  options.protocol = "baidu_std";
  options.connection_type = "single";
-  options.timeout_ms = pserver_timeout_ms;
+  options.timeout_ms = FLAGS_pserver_timeout_ms;

  xpu_channels_.resize(xpu_list_.size());
  for (size_t i = 0; i < xpu_list_.size(); ++i) {
@@ -102,7 +119,7 @@ void HeterClient::SendAndRecvAsync(
  int num = trainer_id_ % xpu_channels_.size();

  brpc::Controller cntl;
-  cntl.set_timeout_ms(pserver_timeout_ms);
+  cntl.set_timeout_ms(FLAGS_pserver_timeout_ms);
  distributed::MultiVarMsg request, response;
  auto& request_io_buffer = cntl.request_attachment();
  ::paddle::PsService_Stub stub(xpu_channels_[num].get());
@@ -149,7 +166,7 @@ std::future<int32_t> HeterClient::SendCmd(
    }
    ::paddle::PsService_Stub rpc_stub(xpu_channels_[i].get());
    closure->cntl(i)->set_timeout_ms(
-        pserver_timeout_ms);  // cmd msg don't limit timeout for save/load
+        FLAGS_pserver_timeout_ms);  // cmd msg don't limit timeout for save/load
    rpc_stub.service(closure->cntl(i), closure->request(i),
                     closure->response(i), closure);
  }

--- a/paddle/fluid/distributed/service/heter_client.h
+++ b/paddle/fluid/distributed/service/heter_client.h
@@ -42,7 +42,7 @@ typedef std::function<void(void*)> HeterRpcCallbackFunc;

 class OnHeterRpcDone : public google::protobuf::Closure {
 public:
-  OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
+  explicit OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
  virtual ~OnHeterRpcDone() {}
  void Run() {
    std::unique_ptr<OnHeterRpcDone> self_guard(this);
@@ -79,7 +79,6 @@ class HeterClient {
    if (NULL == s_instance_) {
      is_initialized_ = true;
      s_instance_.reset(new paddle::distributed::HeterClient());
-      std::vector<std::string> xpu_list = {endpoint};
      s_instance_->SetXpuList(endpoint);
      s_instance_->SetTrainerID(trainer_id);
      s_instance_->CreateClient2XpuConnection();
@@ -89,6 +88,8 @@ class HeterClient {

  void Stop();

+  void FinalizeWorker();
+
  void MainThread();

  void RpcProfilerControl();
@@ -97,6 +98,7 @@ class HeterClient {
                               const std::vector<std::string>& params);

  std::future<int32_t> StartProfiler();
+
  std::future<int32_t> StopProfiler();
  std::future<int32_t> StopHeterWorker();

@@ -104,17 +106,16 @@ class HeterClient {

  void SetXpuList(const std::vector<std::string>& xpu_list) {
    xpu_list_ = xpu_list;
-  };
+  }

  void SetTrainerID(const int& trainer_id) { trainer_id_ = trainer_id; }

 private:
  static std::shared_ptr<HeterClient> s_instance_;
-
- protected:
  static bool is_initialized_;
  std::unique_ptr<std::thread> main_thread_{nullptr};
  std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
+
  DISABLE_COPY_AND_ASSIGN(HeterClient);
  std::vector<std::string> xpu_list_;


--- a/paddle/fluid/distributed/service/heter_server.cc
+++ b/paddle/fluid/distributed/service/heter_server.cc
@@ -45,7 +45,11 @@ void HeterServer::StartHeterService() {
  }
  condition_ready_.notify_all();

-  server_.Join();
+  std::unique_lock<std::mutex> running_lock(mutex_);
+  cv_.wait(running_lock, [&] {
+    VLOG(1) << "Heter Server is Stop? " << stoped_;
+    return stoped_;
+  });
 }

 void HeterServer::SetEndPoint(std::string& endpoint) {
@@ -83,6 +87,7 @@ int32_t HeterService::stop_heter_worker(const PsRequestMessage& request,
  stop_cpu_worker_set_.insert(client_id);
  if (stop_cpu_worker_set_.size() == fan_in_) {
    is_exit_ = true;
+    VLOG(0) << "Stop heter Service done.";
  }
  return 0;
 }

--- a/paddle/fluid/distributed/service/heter_server.h
+++ b/paddle/fluid/distributed/service/heter_server.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <random>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "brpc/channel.h"
 #include "brpc/controller.h"
@@ -34,6 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 #include "paddle/fluid/platform/profiler.h"

+DECLARE_double(eager_delete_tensor_gb);
 namespace paddle {
 namespace distributed {

@@ -82,7 +84,7 @@ class HeterService : public ::paddle::PsService {
      response->set_err_code(service_ret);
      response->set_err_msg("server internal error");
    }
-  };
+  }

  void SendAndRecvVariable(::google::protobuf::RpcController* controller,
                           const MultiVarMsg* request, MultiVarMsg* response,
@@ -134,6 +136,10 @@ class HeterServer {
  virtual ~HeterServer() {}

  void Stop() {
+    VLOG(0) << "HeterServer Stop()";
+    std::unique_lock<std::mutex> lock(mutex_);
+    stoped_ = true;
+    cv_.notify_all();
    server_.Stop(1000);
    server_.Join();
  }
@@ -162,6 +168,10 @@ class HeterServer {

 private:
  static std::shared_ptr<HeterServer> s_instance_;
+  mutable std::mutex mutex_;
+  std::condition_variable cv_;
+  std::condition_variable condition_ready_;
+  bool stoped_ = false;
  std::string endpoint_;

 protected:
@@ -169,7 +179,7 @@ class HeterServer {
  HeterService service_;
  DISABLE_COPY_AND_ASSIGN(HeterServer);
  std::mutex mutex_ready_;
-  std::condition_variable condition_ready_;
+
  int ready_;
 };

@@ -215,6 +225,7 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler {
  int Handle(const MultiVarMsg* request, MultiVarMsg* response,
             brpc::Controller* cntl) override {
    platform::RecordEvent record_event("RequestSendAndRecvHandler->Handle");
+    FLAGS_eager_delete_tensor_gb = -1;
    auto& local_scope = scope_->NewScope();
    auto message_name = request->message_name();
    auto& request_io_buffer = cntl->request_attachment();

--- a/paddle/fluid/distributed/service/server.cc
+++ b/paddle/fluid/distributed/service/server.cc
@@ -60,6 +60,8 @@ int32_t PSServer::configure(const PSParameter &config, PSEnvironment &env,
  _environment = &env;
  _shuffled_ins =
      paddle::framework::MakeChannel<std::pair<uint64_t, std::string>>();
+  size_t shard_num = env.get_ps_servers().size();
+
  const auto &downpour_param = _config.downpour_server_param();

  uint32_t barrier_table = UINT32_MAX;
@@ -72,6 +74,7 @@ int32_t PSServer::configure(const PSParameter &config, PSEnvironment &env,
        "BarrierTable") {
      barrier_table = downpour_param.downpour_table_param(i).table_id();
    }
+    table->set_shard(_rank, shard_num);
    table->initialize(downpour_param.downpour_table_param(i),
                      config.fs_client_param());
    _table_map[downpour_param.downpour_table_param(i).table_id()].reset(table);

--- a/paddle/fluid/distributed/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/table/CMakeLists.txt
@@ -12,8 +12,7 @@ cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse
 set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_framework_proto device_context)
-cc_library(tensor_table SRCS tensor_table.cc DEPS ps_framework_proto proto_desc enforce executor tensor device_context simple_threadpool gflags glog )

 set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

-cc_library(table SRCS table.cc DEPS common_table tensor_table tensor_accessor ps_framework_proto string_helper device_context gflags glog boost)
+cc_library(table SRCS table.cc DEPS common_table tensor_accessor ps_framework_proto string_helper device_context gflags glog boost)
--- a/paddle/fluid/distributed/table/common_sparse_table.cc
+++ b/paddle/fluid/distributed/table/common_sparse_table.cc
@@ -251,6 +251,30 @@ int32_t CommonSparseTable::initialize_value() {
    auto shard = std::make_shared<ValueBlock>(common, &initializers_);
    shard_values_.emplace_back(shard);
  }
+
+  auto accessor = _config.accessor();
+
+  std::vector<uint64_t> feasigns;
+
+  for (size_t x = 0; x < accessor.fea_dim(); ++x) {
+    if (x % _shard_num == _shard_idx) {
+      feasigns.push_back(x);
+    }
+  }
+
+  VLOG(0) << "has " << feasigns.size() << " ids need to be pre inited";
+
+  auto buckets = bucket(feasigns.size(), 10);
+  for (int x = 0; x < 10; ++x) {
+    auto bucket_feasigns = buckets[x + 1] - buckets[x];
+    std::vector<uint64_t> ids(bucket_feasigns);
+    std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1],
+              ids.begin());
+    std::vector<float> pulls;
+    pulls.resize(bucket_feasigns * param_dim_);
+    pull_sparse(pulls.data(), ids.data(), bucket_feasigns);
+  }
+
  return 0;
 }


--- a/paddle/fluid/distributed/table/depends/initializers.h
+++ b/paddle/fluid/distributed/table/depends/initializers.h
@@ -34,6 +34,18 @@ class Initializer {

  virtual float GetValue() = 0;

+  virtual void GetValue(std::vector<float> *values, int numel) {
+    for (int x = 0; x < numel; ++x) {
+      values->push_back(GetValue());
+    }
+  }
+
+  virtual void GetValue(float *value, int numel) {
+    for (int x = 0; x < numel; ++x) {
+      value[x] = GetValue();
+    }
+  }
+
  virtual ~Initializer() {}

 protected:
@@ -54,6 +66,11 @@ class UniformInitializer : public Initializer {
  }

  float GetValue() override { return dist_(*random_engine_); }
+  void GetValue(float *value, int numel) {
+    for (int x = 0; x < numel; ++x) {
+      value[x] = dist_(*random_engine_);
+    }
+  }

 private:
  float min_;
@@ -77,6 +94,11 @@ class GaussianInitializer : public Initializer {
  }

  float GetValue() override { return dist_(*random_engine_); }
+  void GetValue(float *value, int numel) {
+    for (int x = 0; x < numel; ++x) {
+      value[x] = dist_(*random_engine_);
+    }
+  }

 private:
  float std_;
@@ -94,6 +116,7 @@ class FillConstantInitializer : public Initializer {
  }

  float GetValue() override { return value_; }
+  void GetValue(float *value, int numel) { std::fill_n(value, numel, value_); }

 private:
  float value_;

--- a/paddle/fluid/distributed/table/depends/large_scale_kv.h
+++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h
@@ -68,7 +68,7 @@ inline bool entry<float>(const int count, const float threshold) {

 struct VALUE {
  explicit VALUE(const std::vector<std::string> &names)
-      : names_(names), count_(0), unseen_days_(0) {
+      : names_(names), count_(1), unseen_days_(0), seen_after_last_save_(true) {
    values_.resize(names.size());
    for (int i = 0; i < static_cast<int>(names.size()); i++) {
      places[names[i]] = i;
@@ -79,6 +79,14 @@ struct VALUE {
    values_ = std::move(*values);
  }

+  void set(const std::vector<Initializer *> &inits, std::vector<int> numels) {
+    for (int x = 0; x < numels.size(); ++x) {
+      auto &value = values_[x];
+      value.resize(numels[x]);
+      inits[x]->GetValue(value.data(), numels[x]);
+    }
+  }
+
  void set(const std::vector<std::string> &names,
           const std::vector<std::vector<float>> &values) {
    for (int i = 0; i < static_cast<int>(names.size()); i++) {
@@ -117,8 +125,8 @@ struct VALUE {

  std::vector<std::string> names_;
  int count_;
-  bool seen_after_last_save_;
  int unseen_days_;
+  bool seen_after_last_save_;
  bool is_entry_;
  std::vector<std::vector<float>> values_;
  std::unordered_map<std::string, int> places;
@@ -139,15 +147,20 @@ class ValueBlock {
      value_dims_.push_back(dim);
    }

+    for (auto &name : value_names_) {
+      initializer_list_.emplace_back(initializers_->at(name));
+    }
+
    // for Entry
    {
      // entry will add later
      std::string entry_attr = "none";
-
      if (entry_attr == "none") {
+        has_entry = false;
        entry_func_ =
            std::bind(entry<std::string>, std::placeholders::_1, "none");
      } else {
+        has_entry = true;
        auto slices = string::split_string<std::string>(entry_attr, "&");
        if (slices[0] == "count_filter") {
          int threshold = std::stoi(slices[1]);
@@ -181,6 +194,22 @@ class ValueBlock {
    values_[id] = value;
  }

+  void Init(const uint64_t &id, const std::vector<Initializer *> &inits,
+            int count) {
+    if (Has(id)) {
+      PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error"));
+    }
+
+    if (inits.size() != value_names_.size()) {
+      PADDLE_THROW(
+          platform::errors::AlreadyExists("values can not match, error"));
+    }
+
+    auto value = new VALUE(value_names_);
+    value->set(inits, value_dims_);
+    values_[id] = value;
+  }
+
  std::vector<std::vector<float> *> Get(
      const uint64_t &id, const std::vector<std::string> &value_names) {
    auto ret_values = values_.at(id)->get(value_names);
@@ -195,27 +224,12 @@ class ValueBlock {
  void InitFromInitializer(const uint64_t &id,
                           const std::vector<std::string> &value_names) {
    if (Has(id)) {
+      if (has_entry) {
        Update(id);
-      return;
-    }
-
-    auto rets = std::vector<std::vector<float>>();
-    rets.resize(value_names_.size());
-
-    for (int i = 0; i < static_cast<int>(value_names_.size()); i++) {
-      auto name = value_names_[i];
-      auto *init = initializers_->at(name);
-
-      auto dim = value_dims_[i];
-      rets[i].resize(dim);
-
-      for (int j = 0; j < static_cast<int>(dim); j++) {
-        rets[i][j] = init->GetValue();
      }
+      return;
    }
-
-    Init(id, &rets, 0);
-    Update(id);
+    Init(id, initializer_list_, 1);
  }

  bool GetEntry(const uint64_t &id) {
@@ -254,10 +268,12 @@ class ValueBlock {
  std::unordered_map<uint64_t, VALUE *> values_;

 private:
+  bool has_entry = false;
  std::vector<std::string> value_names_;
  std::vector<int> value_dims_;
  std::function<bool(uint64_t)> entry_func_;
  std::unordered_map<std::string, Initializer *> *initializers_;
+  std::vector<Initializer *> initializer_list_;
 };

 }  // namespace distributed

--- a/paddle/fluid/distributed/table/table.cc
+++ b/paddle/fluid/distributed/table/table.cc
@@ -22,14 +22,12 @@
 #include "paddle/fluid/distributed/table/common_sparse_table.h"
 #include "paddle/fluid/distributed/table/sparse_geo_table.h"
 #include "paddle/fluid/distributed/table/tensor_accessor.h"
-#include "paddle/fluid/distributed/table/tensor_table.h"

 namespace paddle {
 namespace distributed {

 REGISTER_CLASS(Table, CommonDenseTable);
 REGISTER_CLASS(Table, CommonSparseTable);
-REGISTER_CLASS(Table, DenseTensorTable);
 REGISTER_CLASS(Table, SparseGeoTable);
 REGISTER_CLASS(Table, BarrierTable);


--- a/paddle/fluid/distributed/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/test/CMakeLists.txt
-if(APPLE)
-    return()
-endif()
-
 set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})

 set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})

-set_source_files_properties(sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(sparse_table_test SRCS sparse_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
-
-set_source_files_properties(geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(geo_table_test SRCS geo_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
-
 set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})

-
-# open it until CI support brpc
-return()
-
 set_source_files_properties(brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(brpc_service_dense_sgd_test SRCS brpc_service_dense_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})


--- a/paddle/fluid/distributed/test/dense_table_test.cc
+++ b/paddle/fluid/distributed/test/dense_table_test.cc
@@ -120,7 +120,7 @@ TEST(CommonDenseTable, Adam) {
    beta2_pow[0] *= beta2;
  }
  for (int j = 0; j < fea_dim; j++) {
-    ASSERT_TRUE(abs(param[j] - pull_values[j]) < 1e-6);
+    ASSERT_TRUE(abs(param[j] - pull_values[j]) < 1e-5);
  }
 }


--- a/paddle/fluid/distributed/test/geo_table_test.cc
+++ b/paddle/fluid/distributed/test/geo_table_test.cc
@@ -62,7 +62,7 @@ TEST(SparseGeoTable, SSUM) {
  std::vector<float> pull_values(init_values.size());
  table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size());
  for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
-    ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-6);
+    ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5);
  }

  std::vector<std::vector<uint64_t>> trainer_keys;

--- a/paddle/fluid/distributed/test/large_scale_test.cc
+++ b/paddle/fluid/distributed/test/large_scale_test.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <ThreadPool.h>
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/distributed/ps.pb.h"
+#include "paddle/fluid/distributed/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
+#include "paddle/fluid/distributed/table/table.h"
+
+namespace paddle {
+namespace distributed {
+
+TEST(BENCHMARK, LargeScaleKV) {
+  int emb_dim = 10;
+  int trainers = 2;
+  float beta1 = 0.9;
+  float beta2 = 0.999;
+  float epsilon = 1.0e-8;
+
+  TableParameter table_config;
+  table_config.set_table_class("CommonSparseTable");
+  FsClientParameter fs_config;
+  Table *table = new CommonSparseTable();
+  TableAccessorParameter *accessor_config = table_config.mutable_accessor();
+  accessor_config->set_accessor_class("CommMergeAccessor");
+  CommonAccessorParameter *common_config = table_config.mutable_common();
+  common_config->set_name("adam");
+  common_config->set_table_name("adam_test_table");
+  common_config->set_trainer_num(trainers);
+  common_config->add_params("Param");
+  common_config->add_dims(emb_dim);
+  common_config->add_initializers("uniform_random&0&-1.0&1.0");
+  common_config->add_params("LearningRate");
+  common_config->add_dims(1);
+  common_config->add_initializers("fill_constant&1.0");
+  common_config->add_params("Moment1");
+  common_config->add_dims(emb_dim);
+  common_config->add_initializers("fill_constant&0.0");
+  common_config->add_params("Moment2");
+  common_config->add_dims(emb_dim);
+  common_config->add_initializers("fill_constant&0.0");
+  common_config->add_params("Beta1Pow");
+  common_config->add_dims(1);
+  common_config->add_initializers("fill_constant&1.0");
+  common_config->add_params("Beta2Pow");
+  common_config->add_dims(1);
+  common_config->add_initializers("fill_constant&1.0");
+  auto ret = table->initialize(table_config, fs_config);
+  ASSERT_EQ(ret, 0);
+}
+
+}  // namespace distributed
+}  // namespace paddle
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -218,16 +218,16 @@ if(WITH_DISTRIBUTE)
    cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
            dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
            heterxpu_trainer.cc
-    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
-    heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+            data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
+            heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
            pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-    device_context scope framework_proto trainer_desc_proto glog fs shell
-    fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
-    lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
-    graph_to_program_pass variable_helper data_feed_proto timer monitor
-    heter_service_proto)
+            device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
+            lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
+            graph_to_program_pass variable_helper timer monitor heter_service_proto fleet)
    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
    set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  endif()
 elseif(WITH_PSLIB)
  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
@@ -239,11 +239,7 @@ elseif(WITH_PSLIB)
  device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
  lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
  graph_to_program_pass variable_helper timer monitor pslib_brpc )
-  # TODO: Fix these unittest failed on Windows
-  # This unittest will always failed, now no CI will run this unittest
-  if(NOT WITH_MUSL AND NOT WIN32)
-    cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
-  endif()
+
 else()
  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
  dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
@@ -254,11 +250,6 @@ else()
  device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
  lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
  graph_to_program_pass variable_helper timer monitor)
-  # TODO: Fix these unittest failed on Windows
-  # This unittest will always failed, now no CI will run this unittest
-  if(NOT WITH_MUSL AND NOT WIN32)
-    cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
-  endif()
 endif()

 target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -15,10 +15,10 @@ cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_he
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)

 if(WITH_DISTRIBUTE)
-    if(NOT WITH_GRPC)
    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
    set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    endif()
+    set_source_files_properties(threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 endif()


@@ -36,7 +36,7 @@ if(WITH_GPU)

    if(WITH_DISTRIBUTE)
        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
+                ddim dynload_cuda selected_rows_functor)
    else()
        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
            ddim dynload_cuda selected_rows_functor)
@@ -52,7 +52,7 @@ else()
            variable_visitor place device_memory_aligment)
    if(WITH_DISTRIBUTE)
        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim selected_rows_functor sendrecvop_rpc)
+                ddim selected_rows_functor)
    else()
        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
            ddim selected_rows_functor)
@@ -85,9 +85,7 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS
 cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)

 set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor)
-if(WITH_DISTRIBUTE)
-    list(APPEND ASYNC_SSA_GRAPH_EXECUTOR_DEPS communicator)
-endif()
+
 cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS})

 cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory

--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -17,7 +17,7 @@
 #include "paddle/fluid/framework/variable_helper.h"

 #ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/communicator.h"
+#include "paddle/fluid/distributed/service/communicator.h"
 #endif

 namespace paddle {
@@ -43,40 +43,7 @@ inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
 }

 // get CommContext and remote send and recv op
-void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-  bool need_communicator = false;
-
-  for (auto &node : graphs[0]->Nodes()) {
-    VLOG(3) << "node name " << node->Name();
-    if (node && node->IsOp()) {
-      if (node->Name() == "send") {
-        auto send_varnames =
-            BOOST_GET_CONST(std::vector<std::string>,
-                            node->Op()->GetNullableAttr("send_varnames"));
-
-        if (send_varnames.size() > 0) {
-          need_communicator = true;
-          break;
-        }
-      }
-    }
-  }
-
-  if (need_communicator) {
-    // init communicator here
-    auto *instance = operators::distributed::Communicator::GetInstance();
-    auto initialized = instance ? true : false;
-    PADDLE_ENFORCE_EQ(initialized, true,
-                      platform::errors::InvalidArgument(
-                          "Communicator is not Initialized, you may use "
-                          "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
-                          "develop/markdown_doc/transpiler)"));
-  }
-
-#endif
-}
+void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) { return; }

 AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
@@ -171,12 +138,12 @@ FetchResultType AsyncSSAGraphExecutor::Run(
                        "results to be fetched!"));
  // init once
  if (run_futures_.size() == 0 && places_.size() > 1) {
-    if (strategy_.thread_barrier_) {
 #ifdef PADDLE_WITH_DISTRIBUTE
-      operators::distributed::Communicator::GetInstance()->BarrierTriggerReset(
+    if (strategy_.thread_barrier_) {
+      paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset(
          places_.size());
-#endif
    }
+#endif
    exception_holder_.Clear();
    StartOffPythonTrainLoop(return_merged);
  }

--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -19,11 +19,6 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/collective_client.h"
-#include "paddle/fluid/operators/distributed/collective_server.h"
-#include "paddle/fluid/operators/distributed/request_handler.h"
-#endif
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -51,106 +46,6 @@ void ReduceOpHandle::Wait(
  }
 }

-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-template <typename DevCtx, typename DataType>
-void ReduceOpHandle::GatherSelectedRows(
-    const std::vector<const SelectedRows *> &src_selected_rows,
-    const std::vector<platform::Place> &in_places,
-    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
-    VarHandle *out_var_handle, const platform::Place &out_place,
-    SelectedRows *dst_selected_rows) {
-  const CollectiveContext &collective_context =
-      *CollectiveContext::GetInstance();
-
-  // 1. gather local selected rows, merge them
-  std::string gathered_var_name = out_var_handle->name() + "_gathered_tmp";
-  auto scope = local_scopes_.at(out_var_handle->scope_idx());
-  auto gathered_var_mid = scope->Var(gathered_var_name);
-  auto gathered_select_rows =
-      gathered_var_mid->GetMutable<framework::SelectedRows>();
-  GatherLocalSelectedRowsFunctor functor(
-      src_selected_rows, in_places, dev_ctxes, out_place, gathered_select_rows);
-  WaitInputVarGenerated();
-  functor();
-
-  // FIXME(gongwb): remove this Wait.
-  Wait(dev_ctxes);
-
-  // merge them
-  auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
-  std::string merged_var_name =
-      GetRemoteVarName(out_var_handle->name(), collective_context.trainer_id_);
-  auto merged_select_rows =
-      scope->Var(merged_var_name)->GetMutable<SelectedRows>();
-  operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
-  merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
-
-  // 2. start collective server if it doesn't exist
-  operators::distributed::CollectiveServer *server =
-      operators::distributed::CollectiveServer::GetInstance(
-          collective_context.endpoints_[collective_context.trainer_id_],
-          collective_context.endpoints_.size() - 1);
-
-  auto rpc_server = server->GetRPCServer();
-  rpc_server->RegisterVar(merged_var_name,
-                          operators::distributed::kRequestGetMonomerVariable,
-                          scope, merged_dev_ctx);
-
-  // 3. gather them from all remote nodes.
-  std::vector<const SelectedRows *> remote;
-  operators::distributed::CollectiveClient *client =
-      operators::distributed::CollectiveClient::GetInstance();
-
-  std::vector<operators::distributed::RemoteVar> vars;
-  for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
-    if (i == (unsigned)collective_context.trainer_id_) continue;
-
-    operators::distributed::RemoteVar var;
-    var.trainer_id_ = i;
-    var.var_name_ = GetRemoteVarName(out_var_handle->name(), i);
-    var.ep_ = collective_context.endpoints_[i];
-
-    vars.push_back(var);
-    VLOG(4) << "gather from:" << var.String();
-  }
-
-  // erase gathered vars
-  merged_dev_ctx->Wait();
-  scope->EraseVars(std::vector<std::string>{gathered_var_name});
-
-  PADDLE_ENFORCE_EQ(
-      client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
-      platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
-  PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
-                    platform::errors::PreconditionNotMet(
-                        "The number of remotes should be equal to the number "
-                        "of variables to be gathered, but got the number of "
-                        "remotes is %d and the number of variables is %d.",
-                        remote.size(), vars.size()));
-
-  // 4. merged local selected rows.
-  std::vector<const SelectedRows *> all;
-  all.resize(collective_context.endpoints_.size());
-  for (auto v : vars) {
-    all[v.trainer_id_] =
-        scope->FindVar(v.var_name_)->GetMutable<SelectedRows>();
-  }
-  all[collective_context.trainer_id_] = merged_select_rows;
-
-  merge_func(*merged_dev_ctx, all, dst_selected_rows);
-
-  rpc_server->WaitVarBarrier(merged_var_name);
-  rpc_server->ClearVar(merged_var_name);
-
-  // 5. clear mid vars
-  std::vector<std::string> tmp_vars{merged_var_name};
-  for (auto r : vars) {
-    tmp_vars.push_back(r.var_name_);
-  }
-  scope->EraseVars(tmp_vars);
-}
-#endif
-
 void ReduceOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name());

@@ -241,25 +136,6 @@ void ReduceOpHandle::RunImpl() {
        functor();
        return;
      }
-
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-      if (in_selected_rows[0]->value().type() ==
-          framework::proto::VarType::FP32) {
-        GatherSelectedRows<platform::CUDADeviceContext, float>(
-            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
-            out_var->GetMutable<framework::SelectedRows>());
-      } else if (in_selected_rows[0]->value().type() ==
-                 framework::proto::VarType::FP64) {
-        GatherSelectedRows<platform::CUDADeviceContext, double>(
-            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
-            out_var->GetMutable<framework::SelectedRows>());
-      } else {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Only support double or float when gather SelectedRows, but got "
-            "%s.",
-            framework::DataTypeToString(in_selected_rows[0]->value().type())));
-      }
-#endif
    });
  } else {
    std::vector<const LoDTensor *> lod_tensors =

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -18,7 +18,7 @@
 #include "paddle/fluid/platform/profiler.h"

 #ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/communicator.h"
+#include "paddle/fluid/distributed/service/communicator.h"
 #endif

 namespace paddle {
@@ -362,14 +362,11 @@ void ThreadedSSAGraphExecutor::ExecutionFinal(
    std::vector<OpHandleBase *> *fetch_ops) {
 #ifdef PADDLE_WITH_DISTRIBUTE
  if (strategy_.thread_barrier_) {
-    operators::distributed::Communicator::GetInstance()
-        ->BarrierTriggerDecrement();
+    paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
  }
 #endif
-
  VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it";
  ClearFetchOp(graph_, fetch_ops);
-
  exception_holder_.ReThrow();
 }


--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -34,7 +34,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
 #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #ifdef PADDLE_WITH_MKLDNN
@@ -91,13 +90,13 @@ Executor::~Executor() {
 }

 void Executor::Close() {
-#ifdef PADDLE_WITH_DISTRIBUTE
-  // TODO(typhoonzero): complete message will need to use real trainer_id,
-  // except 0.
-  auto client =
-      paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
-  client->SendComplete();
-#endif
+  // #ifdef PADDLE_WITH_DISTRIBUTE
+  //   // TODO(typhoonzero): complete message will need to use real trainer_id,
+  //   // except 0.
+  //   auto client =
+  //       paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
+  //   client->SendComplete();
+  // #endif
 }

 void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,

--- a/paddle/fluid/framework/hogwild_worker.cc
+++ b/paddle/fluid/framework/hogwild_worker.cc
@@ -16,10 +16,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/lodtensor_printer.h"

+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/distributed/service/communicator.h"
+#endif
+
 namespace paddle {
 namespace framework {

@@ -185,8 +188,7 @@ void HogwildWorker::TrainFilesWithProfiler() {

 #ifdef PADDLE_WITH_DISTRIBUTE
  if (thread_barrier_) {
-    operators::distributed::Communicator::GetInstance()
-        ->BarrierTriggerDecrement();
+    paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
  }
 #endif
 }
@@ -216,8 +218,7 @@ void HogwildWorker::TrainFiles() {
  }
 #ifdef PADDLE_WITH_DISTRIBUTE
  if (thread_barrier_) {
-    operators::distributed::Communicator::GetInstance()
-        ->BarrierTriggerDecrement();
+    paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
  }
 #endif
 }

--- a/paddle/fluid/framework/multi_trainer.cc
+++ b/paddle/fluid/framework/multi_trainer.cc
@@ -17,7 +17,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
+
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/distributed/service/communicator.h"
+#endif

 namespace paddle {
 namespace framework {
@@ -48,7 +51,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,

 #ifdef PADDLE_WITH_DISTRIBUTE
  if (trainer_desc.thread_barrier()) {
-    operators::distributed::Communicator::GetInstance()->BarrierTriggerReset(
+    paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset(
        thread_num_);
  }
 #endif

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -77,8 +77,13 @@ set(SHARED_INFERENCE_SRCS
    ${mkldnn_quantizer_src_file})

 # Create shared inference library defaultly
-cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
+if(NOT WITH_DISTRIBUTE)
+  cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
      DEPS ${fluid_modules} analysis_predictor)
+else()
+  cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
+      DEPS ${fluid_modules} analysis_predictor fleet ps_service)
+endif()

 get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
 target_link_libraries(paddle_fluid_shared ${os_dependency_modules})

--- a/paddle/fluid/inference/check_symbol.sh
+++ b/paddle/fluid/inference/check_symbol.sh
 #!/bin/sh

+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 lib=$1
 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi

-num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l)
-num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep "T " | wc -l)
+num_paddle_syms=$(nm -D "${lib}" | grep -c paddle )
+num_google_syms=$(nm -D "${lib}" | grep google | grep -v paddle | grep -v brpc | grep -c "T " )

 if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
 if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -20,9 +20,9 @@ add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
 add_subdirectory(jit)

+
 if(WITH_DISTRIBUTE)
-    add_subdirectory(distributed)
-    add_subdirectory(distributed_ops)
+    add_subdirectory(pscore)
    add_subdirectory(collective)
 endif()

@@ -50,10 +50,6 @@ if (WITH_GPU)
    endif()
 endif()

-SET(OP_PREFETCH_DEPS "")
-if (WITH_DISTRIBUTE)
-    SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
-endif()

 SET(OP_MKL_DEPS "")
 if (NOT WITH_MKL OR NOT WITH_AVX)
@@ -70,9 +66,9 @@ if(WITH_UNITY_BUILD)
 endif()

 register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
-    sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+        sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})

-op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})

 if (WITH_GPU)
    # warpctc_op needs cudnn 7 above
@@ -86,9 +82,10 @@ if (WITH_GPU)
 else()
    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
-op_library(lstm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS} lstm_compute)
-op_library(eye_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
-op_library(recurrent_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+
+op_library(lstm_op DEPS ${OP_HEADER_DEPS}  lstm_compute)
+op_library(eye_op DEPS ${OP_HEADER_DEPS})
+op_library(recurrent_op DEPS ${OP_HEADER_DEPS})

 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})

@@ -163,5 +160,5 @@ if(WITH_UNITY_BUILD)
    # Using Unity Build to compile operators, `register_operator` will cause
    # the unity library to lose some symbols.
    # The specified link dependency needs to be displayed here.
-    target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS} ${COMMON_OP_DEPS})
+    target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS})
 endif()
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
 include(operators)

 set(COLLECTIVE_DEPS "")
-if(WITH_GRPC)
-    set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr zlib protobuf node)
-else()
-    set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb protobuf ssl crypto zlib node)
-    if(WITH_BRPC_RDMA)
-        find_library(IBVERBS_LIBRARY NAMES ibverbs)
-        ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
-        SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
-
-
-        find_library(RDMACM_LIBRARY NAMES rdmacm)
-        ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
-        SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
-
-        set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} ibverbs rdmacm)
-    endif()
-endif()

 set(COLLECTIVE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")


--- a/paddle/fluid/operators/collective/allreduce_op.cc
+++ b/paddle/fluid/operators/collective/allreduce_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AllReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor), tensor to be allreduced.");
+    AddOutput("Out", "(Tensor) the result of allreduced.");
+    AddAttr<int>("reduce_type", "(int) determin the reduce type.")
+        .SetDefault(0);
+    AddAttr<bool>(
+        "sync_mode",
+        "(bool) whether to synchronize the CUDA stream after nccl call.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+***AllReduce Operator***
+
+Call NCCL AllReduce internally. Note that this op must be used when one
+thread is managing one GPU device.
+
+For speed reasons, reduce_type should be an integer:
+
+0: sum
+1: prod
+2: max
+3: min
+
+If input and output are the same variable, in-place allreduce will be used.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(allreduce, ops::AllReduceOp,
+                             ops::AllReduceOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    allreduce, ops::AllReduceOpKernel<plat::CPUDeviceContext, float>,
+    ops::AllReduceOpKernel<plat::CPUDeviceContext, double>,
+    ops::AllReduceOpKernel<plat::CPUDeviceContext, int>,
+    ops::AllReduceOpKernel<plat::CPUDeviceContext, int64_t>,
+    ops::AllReduceOpKernel<plat::CPUDeviceContext, plat::float16>);
--- a/paddle/fluid/operators/collective/allreduce_op.cu.cc
+++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    allreduce, ops::AllReduceOpKernel<plat::CUDADeviceContext, float>,
+    ops::AllReduceOpKernel<plat::CUDADeviceContext, double>,
+    ops::AllReduceOpKernel<plat::CUDADeviceContext, int>,
+    ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/collective/allreduce_op.h
+++ b/paddle/fluid/operators/collective/allreduce_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class AllReduceOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto place = ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "AllReduce op can run on gpu place only for now."));
+#if defined(PADDLE_WITH_NCCL)
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    int dtype = platform::ToNCCLDataType(in->type());
+    int64_t numel = in->numel();
+    auto* sendbuff = in->data<void>();
+    out->Resize(in->dims());
+    void* recvbuff = out->mutable_data<T>(place);
+
+    auto* comm = dev_ctx.nccl_comm();
+    // FIXME(typhoonzero): should use nccl stream here.
+    auto stream = dev_ctx.stream();
+    PADDLE_ENFORCE_NOT_NULL(
+        stream, platform::errors::NotFound("Should initialize NCCL firstly."));
+
+    int reduce_type = ctx.Attr<int>("reduce_type");
+    ncclRedOp_t red_type = ncclSum;
+    switch (reduce_type) {
+      case 0:
+        red_type = ncclSum;
+        break;
+      case 1:
+        red_type = ncclProd;
+        break;
+      case 2:
+        red_type = ncclMax;
+        break;
+      case 3:
+        red_type = ncclMin;
+        break;
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
+        comm, stream));
+    if (ctx.Attr<bool>("sync_mode")) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+    }
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/collective/broadcast_op.cc
+++ b/paddle/fluid/operators/collective/broadcast_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <ostream>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class BroadcastOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of BroadcastOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Output) of ConvOp should not be null."));
+  }
+};
+
+class BroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor), tensor to be broadcast.");
+    AddOutput("Out", "(Tensor) the result of broadcast.");
+    AddAttr<bool>(
+        "sync_mode",
+        "(bool) whether to synchronize the CUDA stream after nccl call.")
+        .SetDefault(false);
+    AddAttr<int>("root", "(int).").SetDefault(0).EqualGreaterThan(0);
+    AddComment(R"DOC(
+***Broadcast Operator***
+
+Call NCCL Broadcast internally. Note that this op must be used when one
+thread is managing one GPU device.
+)DOC");
+  }
+};
+
+template <typename T>
+class BroadcastOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Broadcast op can run on gpu place only for now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_WITHOUT_GRADIENT(broadcast, ops::BroadcastOp,
+                             ops::BroadcastOpMaker);
+
+REGISTER_OP_CPU_KERNEL(broadcast, ops::BroadcastOpKernel<float>,
+                       ops::BroadcastOpKernel<double>,
+                       ops::BroadcastOpKernel<int>,
+                       ops::BroadcastOpKernel<int64_t>,
+                       ops::BroadcastOpKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/broadcast_op.cu.cc
+++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::PreconditionNotMet(
+            "The place of ExecutionContext should be CUDAPlace."));
+
+#if defined(PADDLE_WITH_NCCL)
+    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device;
+    int root_dev_id = ctx.Attr<int>("root");
+
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    PADDLE_ENFORCE_EQ(
+        out->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "Currently, the output of broadcast op must be initialized,"
+            "because this op can only be an In-Place operation."));
+    void* send_recv_buffer = out->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE_EQ(
+        send_recv_buffer, in->data<void>(),
+        platform::errors::PreconditionNotMet("Currently, the broadcast op can "
+                                             "only be an In-Place operation."));
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto comm = dev_ctx.nccl_comm();
+    auto stream = dev_ctx.stream();
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+        send_recv_buffer, static_cast<size_t>(in->numel()),
+        platform::ToNCCLDataType(in->type()), root_dev_id, comm, stream));
+
+    VLOG(3) << "Bcast " << ctx.InputNames("X")[0] << ", (" << in->numel() << ")"
+            << " From " << root_dev_id << " to " << dev_id;
+
+    if (ctx.Attr<bool>("sync_mode")) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+    }
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(broadcast, ops::NCCLBroadcastOpKernel<float>,
+                        ops::NCCLBroadcastOpKernel<double>,
+                        ops::NCCLBroadcastOpKernel<int>,
+                        ops::NCCLBroadcastOpKernel<int64_t>,
+                        ops::NCCLBroadcastOpKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
@@ -23,8 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"

--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
-if(NOT WITH_DISTRIBUTE)
-    return()
-endif()
+return()

 if(WITH_GRPC)
    set(cc_generic_services "false")

--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -28,10 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
 #include "paddle/fluid/platform/transform.h"

-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/lookup_table_dequant_op.h
+++ b/paddle/fluid/operators/lookup_table_dequant_op.h
@@ -24,10 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/math/blas.h"

-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -23,10 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/math/blas.h"

-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/lookup_table_v2_op.h
+++ b/paddle/fluid/operators/lookup_table_v2_op.h
@@ -24,10 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/math/blas.h"

-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -26,10 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/sampler.h"
 #include "unsupported/Eigen/CXX11/Tensor"

-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-#endif
-
 namespace paddle {
 namespace operators {

@@ -187,72 +183,7 @@ class NCEKernel : public framework::OpKernel<T> {
    // forward mul
    auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));

-    // for remote prefetch
-    auto remote_prefetch = context.Attr<bool>("remote_prefetch");
-    auto epmap = context.Attr<std::vector<std::string>>("epmap");
-
-    if (remote_prefetch && !epmap.empty()) {
-      // if epmap is not empty, then the parameter will be fetched from remote
-      // parameter
-      // server
-
-      std::vector<int64_t> labels;
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        labels.push_back(sample_labels_data[i]);
-      }
-      std::set<T> st(labels.begin(), labels.end());
-      labels.assign(st.begin(), st.end());
-
-      framework::Scope &local_scope = context.scope().NewScope();
-
-      auto table_names = context.Attr<std::vector<std::string>>("table_names");
-
-      auto *ids = local_scope.Var("Ids@Prefetch");
-      auto *x_tensor = ids->GetMutable<framework::LoDTensor>();
-      x_tensor->mutable_data<int64_t>(
-          framework::make_ddim({static_cast<int64_t>(labels.size()), 1}),
-          context.GetPlace());
-      // copy.
-      std::memcpy(x_tensor->data<int64_t>(), labels.data(),
-                  labels.size() * sizeof(int64_t));
-
-      std::vector<int> w_dims = paddle::framework::vectorize<int>(
-          context.Input<Tensor>("Weight")->dims());
-      w_dims[0] = static_cast<int>(labels.size());
-
-      auto *w_tensor = local_scope.Var("Weight@Prefetch")
-                           ->GetMutable<framework::LoDTensor>();
-      w_tensor->Resize(framework::make_ddim(w_dims));
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-      auto weight = context.InputNames("Weight").front();
-      operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch",
-                                       weight, false, table_names, epmap,
-                                       context, local_scope);
-#else
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "paddle is not compiled with distribute support, can not do "
-          "parameter prefetch!"));
-#endif
-
-      auto weight_mat = EigenMatrix<T>::From(
-          (local_scope.Var("Weight@Prefetch")->Get<framework::LoDTensor>()));
-      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        std::vector<int64_t>::iterator it =
-            std::find(labels.begin(), labels.end(), sample_labels_data[i]);
-        int idx = std::distance(labels.begin(), it);
-
-        Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
-            (input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
-             weight_mat.chip(idx, 0))
-                .sum();
-        sample_out_data[i] += result(0);
-        sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
-      }
-      context.scope().DeleteScope(&local_scope);
-    } else {
-      auto weight_mat =
-          EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
+    auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
    for (int64_t i = 0; i < sample_labels->numel(); ++i) {
      Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
          (input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
@@ -261,7 +192,6 @@ class NCEKernel : public framework::OpKernel<T> {
      sample_out_data[i] += result(0);
      sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
    }
-    }

    // forward cost
    for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {

--- a/paddle/fluid/operators/pscore/CMakeLists.txt
+++ b/paddle/fluid/operators/pscore/CMakeLists.txt
+include(operators)
+
+set(DISTRIBUTE_DEPS "")
+
+list(APPEND DISTRIBUTE_DEPS fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy)
+
+set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
+if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+    set(DISTRIBUTE_COMPILE_FLAGS
+            "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+endif()
+
+file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+list(REMOVE_DUPLICATES OPS)
+
+foreach (src ${OPS})
+    set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+endforeach ()
+
+register_operators()
+
+set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE)
+
+set_source_files_properties(heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op)
+
+set_source_files_properties(heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS})
--- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc
+++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr int64_t kNoPadding = -1;
+
+class DistributedLookupTableOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Ids) of LookupTableOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(W) of LookupTableOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Outs) of LookupTableOp should not be null."));
+
+    auto ids_dims = ctx->GetInputsDim("Ids");
+    auto table_dims = ctx->GetInputDim("W");
+
+    PADDLE_ENFORCE_EQ(
+        table_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "Only 2 dimensions of the 'Embedding' is supported."));
+
+    for (auto &ids_dim : ids_dims) {
+      PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
+                        platform::errors::InvalidArgument(
+                            "The dimension of the 'Ids' tensor must be 2."));
+    }
+
+    // for fluid.embedding
+    auto lookup_table_version =
+        ctx->Attrs().Get<std::string>("lookup_table_version");
+
+    auto outputs_dims = std::vector<framework::DDim>();
+
+    for (auto &ids_dim : ids_dims) {
+      if (lookup_table_version == "lookup_table") {
+        outputs_dims.push_back(
+            framework::make_ddim({ids_dim[0], table_dims[1]}));
+      } else if (lookup_table_version == "lookup_table_v2") {
+        outputs_dims.push_back(framework::make_ddim(
+            {static_cast<int64_t>(ids_dim[0]), static_cast<int64_t>(ids_dim[1]),
+             static_cast<int64_t>(table_dims[1])}));
+      }
+    }
+
+    ctx->SetOutputsDim("Outputs", outputs_dims);
+    ctx->ShareLoD("Ids", /*->*/ "Outputs");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
+        ctx.GetPlace());
+  }
+};
+
+class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids",
+             "(LoDTensor) Ids's type should be LoDTensor"
+             "THe ids to be looked up in W.")
+        .AsDuplicable();
+
+    AddInput("W",
+             "(Tensor) The input represents embedding tensors, "
+             "which is a learnable parameter.");
+
+    AddOutput("Outputs",
+              "(LoDTensor) The lookup results, which have the same type as W.")
+        .AsDuplicable();
+
+    AddAttr<int>("table_id", "sparse table id").SetDefault(0);
+
+    AddAttr<bool>("is_distributed",
+                  "(boolean, default false) distributed lookup table.")
+        .SetDefault(false);
+
+    AddAttr<std::string>(
+        "lookup_table_version",
+        "(string, default lookup_table) "
+        "To distinguish between different versions of embedding OP")
+        .SetDefault(std::string("lookup_table"));
+
+    AddAttr<int64_t>("padding_idx",
+                     "(int64, default -1) "
+                     "If the value is -1, it makes no effect to lookup. "
+                     "Otherwise the given value indicates padding the output "
+                     "with zeros whenever lookup encounters it in Ids.")
+        .SetDefault(kNoPadding);
+    AddAttr<int>("dtype",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::proto::VarType::FP32);
+
+    AddComment(R"DOC(
+Lookup Tablel Prefetch Operator.
+This operator is used to perform lookup on parameter W,
+then concatenated into a sparse tensor.
+The type of Ids(Input) is SelectedRows, the rows of Ids contains
+the ids to be looked up in W;
+if the Id is not in the sparse table, this operator will return a
+random value and set the value into the table for the next looking up.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
+                  ops::DistributedLookupTableOpMaker);
+
+REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
+                       ops::DistributedLookupTableKernel<
+                           paddle::platform::CPUDeviceContext, float>);
--- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc
+++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. */
+
+#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    distributed_lookup_table,
+    ops::DistributedLookupTableKernel<plat::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h
+++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/distributed/fleet.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class DistributedLookupTableKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto &scope = context.scope();
+
+    auto padding_idx = context.Attr<int64_t>("padding_idx");
+    auto table_id = context.Attr<int>("table_id");
+
+    auto embedding_name = context.InputNames("W").front();
+    int64_t emb_dim = 0;
+
+    auto *var = scope.FindVar(embedding_name);
+
+    if (var->IsType<framework::LoDTensor>()) {
+      emb_dim = var->Get<framework::LoDTensor>().dims()[1];
+    } else if (var->IsType<framework::SelectedRows>()) {
+      emb_dim = var->Get<framework::SelectedRows>().value().dims()[1];
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of `W` must be Tensor, SelectedRows.But got "
+          "unsupport type: %s.",
+          framework::ToTypeName(var->Type())));
+    }
+
+    auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
+    auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");
+
+    auto fleet = distributed::FleetWrapper::GetInstance();
+
+    if (platform::is_cpu_place(context.GetPlace())) {
+      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
+                                    static_cast<uint64_t>(padding_idx),
+                                    context.GetPlace(), &inputs, &outputs);
+    } else {
+      auto inputs_variable = context.MultiInputVar("Ids");
+      auto outputs_variable = context.MultiOutputVar("Outputs");
+      auto inputs_name = context.InputNames("Ids");
+      auto outputs_name = context.OutputNames("Outputs");
+
+      auto cpu_place = platform::CPUPlace();
+      framework::Scope *tmp_scope = scope.NewTmpScope().release();
+
+      std::vector<const framework::LoDTensor *> tmp_input_vec;
+      auto input_var_size = inputs_variable.size();
+      std::vector<framework::LoDTensor *> tmp_output_vec;
+      auto output_var_size = outputs_variable.size();
+
+      // create temp input
+      for (size_t idx = 0; idx < input_var_size; ++idx) {
+        framework::Variable *tmp_input_var = tmp_scope->Var(inputs_name[idx]);
+        framework::LoDTensor *tmp_input_tensor =
+            tmp_input_var->GetMutable<framework::LoDTensor>();
+        framework::TensorCopy(inputs_variable[idx]->Get<framework::LoDTensor>(),
+                              cpu_place, context.device_context(),
+                              tmp_input_tensor);
+        tmp_input_vec.push_back(tmp_input_tensor);
+      }
+
+      // create temp output
+      for (size_t idx = 0; idx < output_var_size; ++idx) {
+        framework::Variable *tmp_output_var = tmp_scope->Var(outputs_name[idx]);
+        framework::LoDTensor *tmp_output_tensor =
+            tmp_output_var->GetMutable<framework::LoDTensor>();
+        tmp_output_tensor->Resize(outputs[idx]->dims());
+        tmp_output_vec.push_back(tmp_output_tensor);
+      }
+
+      // use fleet->PullSparse
+      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
+                                    static_cast<uint64_t>(padding_idx),
+                                    cpu_place, &tmp_input_vec, &tmp_output_vec);
+
+      // cp temp to origin
+      for (size_t idx = 0; idx < output_var_size; ++idx) {
+        framework::Variable *tmp_output_var = tmp_scope->Var(outputs_name[idx]);
+        framework::LoDTensor *tmp_output_tensor =
+            tmp_output_var->GetMutable<framework::LoDTensor>();
+        framework::TensorCopy(
+            *tmp_output_tensor, context.GetPlace(), context.device_context(),
+            outputs_variable[idx]->GetMutable<framework::LoDTensor>());
+      }
+      delete tmp_scope;
+    }
+
+    auto id_names = context.InputNames("Ids");
+    auto out_names = context.OutputNames("Outputs");
+    auto lookup_table_version =
+        context.Attr<std::string>("lookup_table_version");
+
+    if (lookup_table_version == "lookup_table_v2") {
+      for (size_t i = 0; i < id_names.size(); ++i) {
+        auto *id_var = scope.FindVar(id_names[i]);
+        auto *out_var = scope.FindVar(out_names[i]);
+        auto *id_tensor = id_var->GetMutable<framework::LoDTensor>();
+        auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
+
+        auto id_dims = id_tensor->dims();
+        out_tensor->Resize(framework::make_ddim(
+            {static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
+             static_cast<int64_t>(emb_dim)}));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/pscore/fake_init_op.cc
+++ b/paddle/fluid/operators/pscore/fake_init_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class FakeInitInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FakeInit");
+    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
+    ctx->SetOutputDim("Out", framework::make_ddim(shape));
+  }
+};
+
+class FakeInitOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    framework::Tensor *tensor = nullptr;
+
+    auto &out_var = *scope.FindVar(Output("Out"));
+
+    if (out_var.IsType<framework::LoDTensor>()) {
+      tensor = out_var.GetMutable<framework::LoDTensor>();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
+    } else if (out_var.IsType<framework::SelectedRows>()) {
+      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "fake init op's output only"
+          "supports SelectedRows and LoDTensor"));
+    }
+  }
+};
+
+class FakeInitOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
+};
+
+class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddAttr<std::vector<int64_t>>("shape",
+                                  "(vector<int64_t>) The shape of the output");
+    AddOutput("Out",
+              "(Tensor) Tensor of specified shape will be filled "
+              "with the specified value");
+    AddComment(R"DOC(
+FakeInit Operator.
+Init an variable but not alloc memory for it, it is used for init the
+table parameter at trainer side in distributed lookup table.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    fake_init, ops::FakeInitOp, ops::FakeInitInferShape, ops::FakeInitOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::FakeInitOpVarTypeInference);
--- a/paddle/fluid/operators/pscore/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/pscore/fetch_barrier_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/service/communicator.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace distributed {
+class Communicator;
+}  // namespace distributed
+
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+class FetchBarrierOp : public framework::OperatorBase {
+ public:
+  FetchBarrierOp(const std::string& type,
+                 const framework::VariableNameMap& inputs,
+                 const framework::VariableNameMap& outputs,
+                 const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    VLOG(4) << "FetchBarrier Sync, do not need now";
+  }
+};
+
+class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Any) Dummy inputs, used for control dependency")
+        .AsDispensable()
+        .AsDuplicable();
+    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SendBarrier operator
+
+This operator will send a send barrier signal to list_and_serv op, so that
+the Parameter Server would knew all variables have been sent.
+)DOC");
+
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({"127.0.0.1:6164"});
+  }
+};
+
+class FetchBarrierOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(
+    fetch_barrier, ops::FetchBarrierOp,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::FetchBarrierOpMaker, ops::FetchBarrierOpShapeInference);
--- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>  // for removing the port file
+#include <csignal>
+#include <cstdlib>
+#include <fstream>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
+
+namespace paddle {
+namespace operators {
+
+static void split(const std::string &str, char sep,
+                  std::vector<std::string> *pieces) {
+  pieces->clear();
+  if (str.empty()) {
+    return;
+  }
+  size_t pos = 0;
+  size_t next = str.find(sep, pos);
+  while (next != std::string::npos) {
+    pieces->push_back(str.substr(pos, next - pos));
+    pos = next + 1;
+    next = str.find(sep, pos);
+  }
+  if (!str.substr(pos).empty()) {
+    pieces->push_back(str.substr(pos));
+  }
+}
+
+HeterListenAndServOp::HeterListenAndServOp(
+    const std::string &type, const framework::VariableNameMap &inputs,
+    const framework::VariableNameMap &outputs,
+    const framework::AttributeMap &attrs)
+    : OperatorBase(type, inputs, outputs, attrs) {}
+
+HeterListenAndServOp::~HeterListenAndServOp() { Stop(); }
+
+void HeterListenAndServOp::Stop() {}
+
+void HeterListenAndServOp::RunAsyncLoop(framework::Executor *executor,
+                                        framework::ProgramDesc *program,
+                                        framework::Scope *recv_scope) const {
+  VLOG(2) << "RunAsyncLoop";
+  auto message_to_block_id_str =
+      Attr<std::vector<std::string>>("message_to_block_id");
+  DoubleFindMap<std::string, int32_t> message_to_block_id;
+
+  auto append_block_maps = [](DoubleFindMap<std::string, int32_t> *out_map,
+                              const std::string &grad_and_id) {
+    std::vector<std::string> pieces;
+    split(grad_and_id, ':', &pieces);
+    VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
+    PADDLE_ENFORCE_EQ(pieces.size(), 2,
+                      platform::errors::PreconditionNotMet(
+                          "Invalid format of message_and_id argument. "
+                          "Expected \"message:block_id\". Recieved %s",
+                          grad_and_id.c_str()));
+    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0,
+                      platform::errors::AlreadyExists(
+                          "The message name %s has already existed in out_map",
+                          pieces[0].c_str()));
+
+    int block_id = std::stoi(pieces[1]);
+    (*out_map)[pieces[0]] = block_id;
+  };
+
+  for (const auto &message_and_id : message_to_block_id_str) {
+    append_block_maps(&message_to_block_id, message_and_id);
+  }
+
+  size_t num_blocks = program->Size();
+  PADDLE_ENFORCE_GE(num_blocks, 1,
+                    platform::errors::PreconditionNotMet(
+                        "Invalid number of blocks in server program. Expected "
+                        "equal or greater than 1. Recieved %zu",
+                        num_blocks));
+  std::vector<int> block_list;
+  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
+    block_list.push_back(blkid);
+  }
+  auto optimize_prepared = executor->Prepare(*program, block_list);
+  // execute global block if needed, block id 1 in the program is global
+  // block if it's not bind to a grad var for it's update.
+  if (block_list[0] == 1 &&
+      message_to_block_id.find_value(static_cast<int32_t>(1)) ==
+          message_to_block_id.end()) {
+    executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
+  }
+
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      message_to_prepared_ctx;
+  for (size_t i = 0; i < block_list.size(); ++i) {
+    auto blkid = block_list[i];
+    auto it = message_to_block_id.find_value(blkid);
+    if (it != message_to_block_id.end()) {
+      message_to_prepared_ctx[it->first] = optimize_prepared[i];
+    }
+  }
+
+  request_send_and_recv_handler_->SetGradToPreparedCtx(
+      &message_to_prepared_ctx);
+
+  for (size_t i = 0; i < block_list.size(); ++i) {
+    auto blkid = block_list[i];
+    auto it = message_to_block_id.find_value(blkid);
+    rpc_service_->RegisterServiceHandler(
+        it->first, [&](const MultiVarMsg *request, MultiVarMsg *response,
+                       brpc::Controller *cntl) -> int {
+          return request_send_and_recv_handler_->Handle(request, response,
+                                                        cntl);
+        });
+  }
+
+  while (true) {
+    if (rpc_service_->IsExit()) {
+      rpc_service_->Stop();
+      VLOG(0) << "get exit. rpc_processor stop!";
+      break;
+    }
+    sleep(1);
+  }  // while(true)
+}
+
+void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service) {
+  service->StartHeterService();
+}
+
+void HeterListenAndServOp::RunImpl(const framework::Scope &scope,
+                                   const platform::Place &dev_place) const {
+  // Mark this as PS that it should decide profiling by listening from trainer.
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto &dev_ctx = *pool.Get(dev_place);
+  VLOG(1) << "HeterListenAndServOp::RunImpl On gpu? "
+          << platform::is_gpu_place(dev_place);
+  framework::Scope &recv_scope = scope.NewScope();
+
+  auto pserver_id = Attr<int>("pserver_id");
+  auto fan_in = Attr<int>("fanin");
+  auto inputs = Inputs("X");
+
+  PADDLE_ENFORCE_EQ(rpc_service_, nullptr,
+                    platform::errors::PreconditionNotMet(
+                        "RPC service has been created unexpectedly."));
+  std::string endpoint = Attr<std::string>("endpoint");
+
+  VLOG(4) << "pserver_id: " << pserver_id << ", end_point:" << endpoint;
+
+  rpc_service_ = distributed::HeterServer::GetInstance();
+  rpc_service_->SetEndPoint(endpoint);
+  rpc_service_->SetFanin(fan_in);
+
+  auto optimize_blocks =
+      Attr<std::vector<framework::BlockDesc *>>("optimize_blocks");
+  PADDLE_ENFORCE_GE(optimize_blocks.size(), 1,
+                    platform::errors::PreconditionNotMet(
+                        "optimize blocks is less than 1. Optimize blocks "
+                        "should be 1 at least on the pserver side."));
+  auto *program = optimize_blocks[0]->Program();
+  framework::Executor executor(dev_place);
+
+  request_send_and_recv_handler_.reset(
+      new distributed::RequestSendAndRecvHandler());
+  request_send_and_recv_handler_->SetScope(&recv_scope);
+  request_send_and_recv_handler_->SetDevCtx(&dev_ctx);
+  request_send_and_recv_handler_->SetProgram(program);
+  request_send_and_recv_handler_->SetExecutor(&executor);
+
+  VLOG(2) << "RunAsyncLoop";
+  auto message_to_block_id_str =
+      Attr<std::vector<std::string>>("message_to_block_id");
+
+  // start the server listening after all member initialized.
+  server_thread_.reset(new std::thread(RunServer, rpc_service_));
+  VLOG(3) << "wait server thread to become ready...";
+  rpc_service_->WaitServerReady();
+  RunAsyncLoop(&executor, program, &recv_scope);
+  VLOG(3) << "Wait for Server_thread_ stop";
+  (server_thread_.get())->join();
+  VLOG(3) << "Server_thread_ stop";
+}
+
+class HeterListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
+    AddComment(
+        R"DOC(" + "HeterListenAndServ operator" + "\n" + "This operator" +
+" will start a RPC server which can receive variables from send_op and send" +
+"back variables to recv_op.)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164)"
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr<int>("pserver_id",
+                 "(int, default -1), the parameter server index id")
+        .SetDefault(-1);
+    AddAttr<std::vector<std::string>>(
+        "message_to_block_id",
+        "['param1@GRAD.block0:1', 'param2@GRAD.blockn:2'] "
+        "a map from message name to it's optimize block id")
+        .SetDefault({});
+    AddAttr<int>("distributed_mode",
+                 "indicate distriubte training mode, 0 is sync, 1 is "
+                 "fully-async, 2 is half-async, 3 is geo")
+        .SetDefault(0);
+    AddAttr<std::vector<framework::BlockDesc *>>(
+        "optimize_blocks", "Optimize blocks to run on server side.")
+        .SetDefault({});
+    AddAttr<int>("fanin", "How many clients send to this server.")
+        .SetDefault(1);
+    AddAttr<int>("rpc_exec_thread_num", "pserver send thread num.")
+        .SetDefault(1);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(heter_listen_and_serv, ops::HeterListenAndServOp,
+                  ops::HeterListenAndServOpMaker);
--- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h
+++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/distributed/service/brpc_utils.h"
+#include "paddle/fluid/distributed/service/heter_server.h"
+#include "paddle/fluid/distributed/service/sendrecv.pb.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+class Executor;
+class ProgramDesc;
+class Scope;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+using MultiVarMsg = ::paddle::MultiVariableMessage;
+using VarMsg = ::paddle::VariableMessage;
+
+template <class TKey, class TValue>
+class DoubleFindMap : public std::unordered_map<TKey, TValue> {
+ public:
+  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
+    return std::find_if(this->begin(), this->end(),
+                        [&v](const std::pair<const std::string, int> p) {
+                          return p.second == v;
+                        });
+  }
+};
+
+void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service);
+
+class HeterListenAndServOp : public framework::OperatorBase {
+ public:
+  HeterListenAndServOp(const std::string& type,
+                       const framework::VariableNameMap& inputs,
+                       const framework::VariableNameMap& outputs,
+                       const framework::AttributeMap& attrs);
+  virtual ~HeterListenAndServOp();
+
+  void RunAsyncLoop(framework::Executor* executor,
+                    framework::ProgramDesc* program,
+                    framework::Scope* recv_scope) const;
+
+  void Stop() override;
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override;
+
+ protected:
+  mutable std::shared_ptr<paddle::distributed::HeterServer> rpc_service_;
+  mutable std::shared_ptr<std::thread> server_thread_;
+  mutable std::shared_ptr<paddle::distributed::HeterRequestHandler>
+      request_send_and_recv_handler_;
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <chrono>  // NOLINT
+#include <memory>
+#include <string>
+#include <thread>  // NOLINT
+#include <unordered_map>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/distributed/service/brpc_utils.h"
+#include "paddle/fluid/distributed/service/heter_client.h"
+#include "paddle/fluid/distributed/service/heter_server.h"
+
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace distributed = paddle::distributed;
+
+using MultiVarMsg = ::paddle::MultiVariableMessage;
+using VarMsg = ::paddle::VariableMessage;
+DECLARE_double(eager_delete_tensor_gb);
+
+USE_OP(scale);
+USE_NO_KERNEL_OP(heter_listen_and_serv);
+
+framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
+  framework::BlockDesc* block =
+      program->AppendBlock(*(program->MutableBlock(0)));
+
+  framework::OpDesc* op = block->AppendOp();
+  op->SetType("scale");
+  op->SetInput("X", {"x"});
+  op->SetOutput("Out", {"res"});
+  op->SetAttr("scale", 0.5f);
+
+  auto* out = block->Var("res");
+  out->SetType(framework::proto::VarType::LOD_TENSOR);
+  out->SetShape({1, 10});
+
+  return block;
+}
+
+void GetHeterListenAndServProgram(framework::ProgramDesc* program) {
+  auto root_block = program->MutableBlock(0);
+
+  auto* sub_block = AppendSendAndRecvBlock(program);
+  std::vector<framework::BlockDesc*> optimize_blocks;
+  optimize_blocks.push_back(sub_block);
+
+  std::vector<std::string> message_to_block_id = {"x:1"};
+  std::string endpoint = "127.0.0.1:19944";
+
+  framework::OpDesc* op = root_block->AppendOp();
+  op->SetType("heter_listen_and_serv");
+  op->SetInput("X", {});
+  op->SetAttr("message_to_block_id", message_to_block_id);
+  op->SetAttr("optimize_blocks", optimize_blocks);
+  op->SetAttr("endpoint", endpoint);
+  op->SetAttr("fanin", 1);
+  op->SetAttr("pserver_id", 0);
+}
+
+void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
+  auto x_var = scope->Var("x");
+  x_var->GetMutable<framework::LoDTensor>();
+
+  auto res_var = scope->Var("res");
+  res_var->GetMutable<framework::LoDTensor>();
+}
+
+void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
+  float* x_ptr =
+      x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
+  for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
+
+  auto res_var = scope->Var("res")->GetMutable<framework::LoDTensor>();
+  float* res_ptr =
+      res_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
+  for (int64_t i = 0; i < rows_numel; ++i) res_ptr[i] = 1.0;
+}
+
+void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+}
+
+void StartHeterServer() {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  platform::CPUDeviceContext ctx(place);
+
+  LOG(INFO) << "before GetHeterListenAndServProgram";
+  GetHeterListenAndServProgram(&program);
+  auto prepared = exe.Prepare(program, 0);
+
+  LOG(INFO) << "before InitTensorsOnServer";
+  InitTensorsOnServer(&scope, &place, 10);
+
+  LOG(INFO) << "before RunPreparedContext";
+  exe.RunPreparedContext(prepared.get(), &scope, false);
+}
+
+TEST(HETER_LISTEN_AND_SERV, CPU) {
+  setenv("http_proxy", "", 1);
+  setenv("https_proxy", "", 1);
+  std::string endpoint = "127.0.0.1:19944";
+  LOG(INFO) << "before StartSendAndRecvServer";
+  FLAGS_eager_delete_tensor_gb = -1;
+  std::thread server_thread(StartHeterServer);
+  sleep(1);
+
+  LOG(INFO) << "before HeterClient::GetInstance";
+  distributed::HeterClient* rpc_client =
+      distributed::HeterClient::GetInstance({endpoint}, 0).get();
+
+  PADDLE_ENFORCE_NE(rpc_client, nullptr,
+                    platform::errors::InvalidArgument(
+                        "Client Start Fail, Check Your Code & Env"));
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  // create var on local scope
+  int64_t rows_numel = 10;
+  LOG(INFO) << "before InitTensorsOnClient";
+  InitTensorsOnClient(&scope, &place, rows_numel);
+  std::string in_var_name("x");
+  std::string out_var_name("res");
+  std::vector<std::string> send_var = {in_var_name};
+  std::vector<std::string> recv_var = {out_var_name};
+
+  LOG(INFO) << "before SendAndRecvAsync";
+  rpc_client->SendAndRecvAsync({endpoint}, ctx, scope, in_var_name, send_var,
+                               recv_var);
+  auto var = scope.Var(out_var_name);
+  auto value = var->GetMutable<framework::LoDTensor>();
+  auto ptr = value->mutable_data<float>(place);
+
+  LOG(INFO) << "before CHECK";
+  for (int64_t i = 0; i < rows_numel; ++i) {
+    LOG(INFO) << "ptr " << i << " is " << ptr[i];
+    EXPECT_EQ(ptr[i], 0.5);
+  }
+  LOG(INFO) << "end CHECK";
+  rpc_client->Stop();
+  LOG(INFO) << "end server Stop";
+  server_thread.join();
+  LOG(INFO) << "end server thread join";
+}
--- a/paddle/fluid/operators/pscore/heter_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_server_test.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <chrono>  // NOLINT
+#include <memory>
+#include <string>
+#include <thread>  // NOLINT
+#include <unordered_map>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/operator.h"
+
+#include "paddle/fluid/distributed/service/brpc_utils.h"
+#include "paddle/fluid/distributed/service/heter_client.h"
+#include "paddle/fluid/distributed/service/heter_server.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace distributed = paddle::distributed;
+
+using MultiVarMsg = ::paddle::MultiVariableMessage;
+using VarMsg = ::paddle::VariableMessage;
+
+USE_OP(scale);
+
+std::shared_ptr<distributed::HeterServer> b_rpc_service;
+
+framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
+  auto root_block = program->MutableBlock(0);
+  auto* block = program->AppendBlock(*root_block);
+
+  framework::OpDesc* op = block->AppendOp();
+  op->SetType("scale");
+  op->SetInput("X", {"x"});
+  op->SetOutput("Out", {"res"});
+  op->SetAttr("scale", 0.5f);
+
+  auto& out = *root_block->Var("res");
+  out.SetType(framework::proto::VarType::LOD_TENSOR);
+  out.SetShape({1, 10});
+
+  return block;
+}
+
+void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
+  auto w_var = scope->Var("w");
+  w_var->GetMutable<framework::SelectedRows>();
+
+  auto out_var = scope->Var("out");
+  out_var->GetMutable<framework::LoDTensor>();
+
+  auto ids_var = scope->Var("ids");
+  ids_var->GetMutable<framework::LoDTensor>();
+
+  auto x_var = scope->Var("x");
+  x_var->GetMutable<framework::LoDTensor>();
+
+  auto res_var = scope->Var("res");
+  res_var->GetMutable<framework::LoDTensor>();
+}
+
+void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto ids_var = scope->Var("ids")->GetMutable<framework::LoDTensor>();
+  int64_t* ids_ptr =
+      ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
+  for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
+
+  auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
+  float* x_ptr =
+      x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
+  for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
+
+  auto res_var = scope->Var("res")->GetMutable<framework::LoDTensor>();
+  float* res_ptr =
+      res_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
+  for (int64_t i = 0; i < rows_numel; ++i) res_ptr[i] = 1.0;
+}
+
+void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
+  auto w_value = w->mutable_value();
+  w_value->Resize({rows_numel, 10});
+  for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
+
+  auto ptr = w_value->mutable_data<float>(*place);
+
+  for (int64_t i = 0; i < w_value->numel(); ++i) {
+    ptr[i] = static_cast<float>(i / 10);
+  }
+}
+
+void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service) {
+  service->StartHeterService();
+}
+
+void StartSendAndRecvServer(std::string endpoint) {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  platform::CPUDeviceContext ctx(place);
+  LOG(INFO) << "before AppendSendAndRecvBlock";
+  auto block = AppendSendAndRecvBlock(&program);
+  std::string in_var_name("x");
+  std::vector<int> prefetch_block_ids{block->ID()};
+  auto prepared = exe.Prepare(program, prefetch_block_ids);
+
+  LOG(INFO) << "before InitTensorsOnServer";
+  InitTensorsOnServer(&scope, &place, 10);
+  LOG(INFO) << "end InitTensorsOnServer";
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      message_to_prepared_ctx;
+  message_to_prepared_ctx[in_var_name] = prepared[0];
+
+  std::shared_ptr<distributed::RequestSendAndRecvHandler> b_req_handler;
+  b_req_handler.reset(new distributed::RequestSendAndRecvHandler());
+  LOG(INFO) << "before SetProgram";
+  b_req_handler->SetProgram(&program);
+  LOG(INFO) << "before SetGradToPreparedCtx";
+  b_req_handler->SetGradToPreparedCtx(&message_to_prepared_ctx);
+  LOG(INFO) << "before SetDevCtx";
+  b_req_handler->SetDevCtx(&ctx);
+  LOG(INFO) << "before SetScope";
+  b_req_handler->SetScope(&scope);
+  LOG(INFO) << "before SetExecutor";
+  b_req_handler->SetExecutor(&exe);
+  LOG(INFO) << "before HeterServer::GetInstance";
+  b_rpc_service = distributed::HeterServer::GetInstance();
+  b_rpc_service->SetEndPoint(endpoint);
+  LOG(INFO) << "before HeterServer::RegisterServiceHandler";
+  b_rpc_service->RegisterServiceHandler(
+      in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response,
+                       brpc::Controller* cntl) -> int {
+        return b_req_handler->Handle(request, response, cntl);
+      });
+
+  LOG(INFO) << "before HeterServer::RunServer";
+  std::thread server_thread(std::bind(RunServer, b_rpc_service));
+
+  server_thread.join();
+}
+
+TEST(SENDANDRECV, CPU) {
+  setenv("http_proxy", "", 1);
+  setenv("https_proxy", "", 1);
+  std::string endpoint = "127.0.0.1:4444";
+  LOG(INFO) << "before StartSendAndRecvServer";
+  b_rpc_service = distributed::HeterServer::GetInstance();
+  std::thread server_thread(StartSendAndRecvServer, endpoint);
+  b_rpc_service->WaitServerReady();
+
+  LOG(INFO) << "before HeterClient::GetInstance";
+  distributed::HeterClient* rpc_client =
+      distributed::HeterClient::GetInstance({endpoint}, 0).get();
+
+  PADDLE_ENFORCE_NE(rpc_client, nullptr,
+                    platform::errors::InvalidArgument(
+                        "Client Start Fail, Check Your Code & Env"));
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  // create var on local scope
+  int64_t rows_numel = 10;
+  LOG(INFO) << "before InitTensorsOnClient";
+  InitTensorsOnClient(&scope, &place, rows_numel);
+  std::string in_var_name("x");
+  std::string out_var_name("res");
+  std::vector<std::string> send_var = {in_var_name};
+  std::vector<std::string> recv_var = {out_var_name};
+
+  LOG(INFO) << "before SendAndRecvAsync";
+  rpc_client->SendAndRecvAsync({endpoint}, ctx, scope, in_var_name, send_var,
+                               recv_var);
+  auto var = scope.Var(out_var_name);
+  auto value = var->GetMutable<framework::LoDTensor>();
+  auto ptr = value->mutable_data<float>(place);
+
+  LOG(INFO) << "before CHECK";
+  for (int64_t i = 0; i < rows_numel; ++i) {
+    LOG(INFO) << "ptr " << i << " is " << ptr[i];
+    EXPECT_EQ(ptr[i], 0.5);
+  }
+  LOG(INFO) << "end CHECK";
+  rpc_client->FinalizeWorker();
+  // b_rpc_service->Stop();
+  b_rpc_service->Stop();
+  LOG(INFO) << "end server Stop";
+  server_thread.join();
+  LOG(INFO) << "end server thread join";
+}
--- a/paddle/fluid/operators/pscore/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/pscore/listen_and_serv_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+
+constexpr char kLRDecayBlockId[] = "lr_decay_block_id";
+constexpr char kCheckpointBlockId[] = "checkpint_block_id";
+constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
+constexpr char kOptimizeBlocks[] = "optimize_blocks";
+constexpr char kSparseGradToParam[] = "sparse_grad_to_param";
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+class ListenAndServOp : public framework::OperatorBase {
+ public:
+  ListenAndServOp(const std::string& type,
+                  const framework::VariableNameMap& inputs,
+                  const framework::VariableNameMap& outputs,
+                  const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    VLOG(1) << "just for recorder";
+  }
+};
+
+class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
+    AddComment(R"DOC(" + "ListenAndServ operator" + "\n" + "This operator" +
+" will start a RPC server which can receive variables from send_op and send" +
+"back variables to recv_op.)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164)"
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string& ip) { return !ip.empty(); });
+    AddAttr<int>("pserver_id",
+                 "(int, default -1), the parameter server index id")
+        .SetDefault(-1);
+    AddAttr<std::vector<std::string>>(
+        "grad_to_block_id",
+        "['param1@GRAD.block0:1', 'param2@GRAD.blockn:2'] "
+        "a map from grad name to it's optimize block id")
+        .SetDefault({});
+    AddAttr<int>("distributed_mode",
+                 "indicate distriubte training mode, 0 is sync, 1 is "
+                 "fully-async, 2 is half-async, 3 is geo")
+        .SetDefault(0);
+    AddAttr<bool>("dc_asgd", "set to true will enable DC-ASGD training.")
+        .SetDefault(false);
+    AddAttr<std::vector<framework::BlockDesc*>>(
+        kOptimizeBlocks, "Optimize blocks to run on server side.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
+                                      "prefetch blocks to run on server side.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        kSparseGradToParam,
+        "sparse grad name to param name. like: 'emb@Grad:emb'")
+        .SetDefault({});
+    AddAttr<int>("Fanin", "How many clients send to this server.")
+        .SetDefault(1);
+    AddAttr<int>(kCheckpointBlockId,
+                 "BolckID to run save checkpoint on pserer.")
+        .SetDefault(-1);
+    AddAttr<int>(kLRDecayBlockId, "BolckID to run lr decay on pserer.")
+        .SetDefault(-1);
+    AddAttr<int>("rpc_get_thread_num", "pserver get thread num.").SetDefault(1);
+    AddAttr<int>("rpc_send_thread_num", "pserver send thread num.")
+        .SetDefault(1);
+    AddAttr<int>("rpc_prefetch_thread_num", "pserver prefetch thread num.")
+        .SetDefault(1);
+  }
+};
+
+class ListenAndServOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(
+    listen_and_serv, ops::ListenAndServOp,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::ListenAndServOpMaker, ops::ListenAndServOpShapeInference);
--- a/paddle/fluid/operators/pscore/send_and_recv_op.cc
+++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/distributed/service/heter_client.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SendAndRecvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& scope = ctx.scope();
+    const auto& place = ctx.GetPlace();
+    auto message_name = ctx.Attr<std::string>("message_name");
+    auto send_var_name = ctx.Attr<std::vector<std::string>>("send_var_name");
+    auto recv_var_name = ctx.Attr<std::vector<std::string>>("recv_var_name");
+    auto epmap = ctx.Attr<std::vector<std::string>>("endpoints");
+    auto trainer_id = ctx.Attr<int>("trainer_id");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& context = *pool.Get(place);
+
+    distributed::HeterClient* rpc_client =
+        distributed::HeterClient::GetInstance(epmap, trainer_id).get();
+    VLOG(3) << "SendAndRecvOp message_name: " << message_name;
+    rpc_client->SendAndRecvAsync(epmap, context, scope, message_name,
+                                 send_var_name, recv_var_name);
+  }
+};
+
+class SendAndRecvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(data_type, platform::CPUPlace());
+  }
+};
+
+class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
+    AddOutput("Out", "Tensor Output varibale to be recv").AsDuplicable();
+    AddAttr<std::string>("message_name", "");
+    AddAttr<std::vector<std::string>>("send_var_name", "Send Tensor's name");
+    AddAttr<std::vector<std::string>>("recv_var_name", "Recv Tensor's name");
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<std::vector<std::string>>("endpoints", "Server endpoint")
+        .SetDefault({"127.0.0.1:6164"});
+    AddComment(R"DOC(
+    SendAndRecv operator
+    This operator will send variables to listen_and_serve op at the parameter server.
+    And recv variable from parameter server of send variable's scope.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    send_and_recv,
+    ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>)
--- a/paddle/fluid/operators/pscore/send_barrier_op.cc
+++ b/paddle/fluid/operators/pscore/send_barrier_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/service/communicator.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+
+namespace distributed {
+class Communicator;
+}  // namespace distributed
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+class SendBarrierOp : public framework::OperatorBase {
+ public:
+  SendBarrierOp(const std::string& type,
+                const framework::VariableNameMap& inputs,
+                const framework::VariableNameMap& outputs,
+                const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    paddle::distributed::Communicator::GetInstance()->Barrier();
+  }
+};
+
+class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Any) Dummy inputs, used for control dependency")
+        .AsDuplicable();
+    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SendBarrier operator
+
+This operator will send a send barrier signal to list_and_serv op, so that
+the Parameter Server would knew all variables have been sent.
+)DOC");
+
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({"127.0.0.1:6164"});
+    AddAttr<bool>(
+        "half_async",
+        "(bool, default false)"
+        "half_async=True is for half_async mode, this will send signal "
+        "to HalfAsyncCommunicator Instance")
+        .SetDefault(false);
+  }
+};
+
+class SendBarrierOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(
+    send_barrier, ops::SendBarrierOp,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::SendBarrierOpMaker, ops::SendBarrierOpShapeInference);
--- a/paddle/fluid/operators/pscore/send_op.cc
+++ b/paddle/fluid/operators/pscore/send_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/distributed/fleet.h"
+#include "paddle/fluid/distributed/service/communicator.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+namespace distributed {
+class RPCClient;
+}  // namespace distributed
+
+class SendOp : public framework::OperatorBase {
+ public:
+  SendOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    auto ins = Inputs("X");
+    // auto is_sparse = Attr<int>("is_sparse");
+    // auto table_id = Attr<int>("table_id");
+
+    auto send_varnames = Attr<std::vector<std::string>>("send_varnames");
+
+    auto* communicator = paddle::distributed::Communicator::GetInstance();
+    communicator->Check(send_varnames);
+    communicator->Send(ins, scope);
+
+    // auto fleet = paddle::distributed::FleetWrapper::GetInstance();
+    // if (is_sparse == 0) {
+    //   std::vector<::std::future<int32_t>> status;
+    //   fleet->PushDenseVarsAsync(scope, table_id, send_varnames, &status, 0,
+    //   -1);
+    // } else {
+    //   std::vector<::std::future<int32_t>> status;
+    //   fleet->PushSparseVarsAsync(scope, table_id, send_varnames[0], &status);
+    // }
+  }
+};
+
+class SendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
+        .AsDuplicable();
+    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
+        .AsDuplicable();
+    AddComment(R"DOC(
+Send operator
+
+This operator will send variables to listen_and_serve op at the parameter server.
+)DOC");
+    AddAttr<int>("table_id", "table_id for send").SetDefault(0);
+    AddAttr<int>("is_sparse",
+                 "(int, default 0->Dense, 1->Sparse, 2->Distributed)")
+        .SetDefault(0);
+    AddAttr<std::vector<std::string>>(
+        "send_varnames",
+        "(vector<string>) "
+        "the split output varnames to send to pserver")
+        .SetDefault(std::vector<std::string>{});
+  }
+};
+
+class SendOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(
+    send, ops::SendOp,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::SendOpMaker, ops::SendOpShapeInference);
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -20,10 +20,6 @@ if(WITH_PYTHON)
  list(APPEND PYBIND_DEPS py_func_op)
 endif()

-if (WITH_DISTRIBUTE)
-  list(APPEND PYBIND_DEPS communicator)
-endif()
-
 set(PYBIND_SRCS
  pybind.cc
  exception.cc
@@ -54,7 +50,10 @@ if (WITH_CRYPTO)
 endif (WITH_CRYPTO)

 if (WITH_DISTRIBUTE)
-  list(APPEND PYBIND_SRCS communicator_py.cc)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result")
+  set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  list(APPEND PYBIND_DEPS fleet communicator)
+  list(APPEND PYBIND_SRCS fleet_py.cc)
 endif()

 if (WITH_NCCL)

--- a/paddle/fluid/pybind/fleet_py.cc
+++ b/paddle/fluid/pybind/fleet_py.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fcntl.h>
+
+#ifdef _POSIX_C_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+
+#include "paddle/fluid/pybind/fleet_py.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/distributed/communicator_common.h"
+#include "paddle/fluid/distributed/fleet.h"
+#include "paddle/fluid/distributed/service/communicator.h"
+#include "paddle/fluid/distributed/service/env.h"
+#include "paddle/fluid/distributed/service/heter_client.h"
+
+namespace py = pybind11;
+using paddle::distributed::CommContext;
+using paddle::distributed::Communicator;
+using paddle::distributed::FleetWrapper;
+using paddle::distributed::HeterClient;
+
+namespace paddle {
+namespace pybind {
+void BindDistFleetWrapper(py::module* m) {
+  py::class_<FleetWrapper, std::shared_ptr<FleetWrapper>>(*m,
+                                                          "DistFleetWrapper")
+      .def(py::init([]() { return FleetWrapper::GetInstance(); }))
+      .def("load_sparse", &FleetWrapper::LoadSparseOnServer)
+      .def("init_server", &FleetWrapper::InitServer)
+      .def("run_server",
+           (uint64_t (FleetWrapper::*)(void)) & FleetWrapper::RunServer)
+      .def("run_server", (uint64_t (FleetWrapper::*)(          // NOLINT
+                             const std::string&, uint32_t)) &  // NOLINT
+                             FleetWrapper::RunServer)
+      .def("init_worker", &FleetWrapper::InitWorker)
+      .def("push_dense_params", &FleetWrapper::PushDenseParamSync)
+      .def("pull_dense_params", &FleetWrapper::PullDenseVarsSync)
+      .def("save_all_model", &FleetWrapper::SaveModel)
+      .def("save_one_model", &FleetWrapper::SaveModelOneTable)
+      .def("sparse_table_stat", &FleetWrapper::PrintTableStat)
+      .def("stop_server", &FleetWrapper::StopServer)
+      .def("stop_worker", &FleetWrapper::FinalizeWorker)
+      .def("barrier", &FleetWrapper::BarrierWithTable);
+}  // end BindDistFleetWrapper
+
+void BindPSHost(py::module* m) {
+  py::class_<distributed::PSHost>(*m, "PSHost")
+      .def(py::init<const std::string&, uint32_t, uint32_t>())
+      .def("serialize_to_string", &distributed::PSHost::serialize_to_string)
+      .def("parse_from_string", &distributed::PSHost::parse_from_string)
+      .def("to_uint64", &distributed::PSHost::serialize_to_uint64)
+      .def("from_uint64", &distributed::PSHost::parse_from_uint64)
+      .def("to_string", &distributed::PSHost::to_string);
+}
+
+void BindCommunicatorContext(py::module* m) {
+  py::class_<CommContext>(*m, "CommContext")
+      .def(
+          py::init<const std::string&, const std::vector<std::string>&,
+                   const std::vector<std::string>&, const std::vector<int64_t>&,
+                   const std::vector<std::string>&, int, bool, bool, bool,
+                   int>())
+      .def("var_name", [](const CommContext& self) { return self.var_name; })
+      .def("trainer_id",
+           [](const CommContext& self) { return self.trainer_id; })
+      .def("table_id", [](const CommContext& self) { return self.table_id; })
+      .def("split_varnames",
+           [](const CommContext& self) { return self.splited_varnames; })
+      .def("split_endpoints",
+           [](const CommContext& self) { return self.epmap; })
+      .def("sections",
+           [](const CommContext& self) { return self.height_sections; })
+      .def("aggregate", [](const CommContext& self) { return self.merge_add; })
+      .def("is_sparse", [](const CommContext& self) { return self.is_sparse; })
+      .def("is_distributed",
+           [](const CommContext& self) { return self.is_distributed; })
+      .def("origin_varnames",
+           [](const CommContext& self) { return self.origin_varnames; })
+      .def("__str__", [](const CommContext& self) { return self.print(); });
+}
+
+using paddle::distributed::AsyncCommunicator;
+using paddle::distributed::GeoCommunicator;
+using paddle::distributed::RecvCtxMap;
+using paddle::distributed::RpcCtxMap;
+using paddle::distributed::SyncCommunicator;
+using paddle::framework::Scope;
+
+void BindDistCommunicator(py::module* m) {
+  // Communicator is already used by nccl, change to DistCommunicator
+  py::class_<Communicator, std::shared_ptr<Communicator>>(*m,
+                                                          "DistCommunicator")
+      .def(py::init([](const std::string& mode, const std::string& dist_desc,
+                       const std::vector<std::string>& host_sign_list,
+                       const RpcCtxMap& send_ctx, const RecvCtxMap& recv_ctx,
+                       Scope* param_scope,
+                       std::map<std::string, std::string>& envs) {
+        if (mode == "ASYNC") {
+          Communicator::InitInstance<AsyncCommunicator>(
+              send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope, envs);
+        } else if (mode == "SYNC") {
+          Communicator::InitInstance<SyncCommunicator>(
+              send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope, envs);
+        } else if (mode == "GEO") {
+          Communicator::InitInstance<GeoCommunicator>(
+              send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope, envs);
+        } else {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "unsuported communicator MODE"));
+        }
+        return Communicator::GetInstantcePtr();
+      }))
+      .def("stop", &Communicator::Stop)
+      .def("start", &Communicator::Start)
+      .def("push_sparse_param", &Communicator::RpcSendSparseParam)
+      .def("is_running", &Communicator::IsRunning)
+      .def("init_params", &Communicator::InitParams);
+  //  .def("recv", &Communicator::RecvNoBarrier);
+}
+
+void BindHeterClient(py::module* m) {
+  py::class_<HeterClient, std::shared_ptr<HeterClient>>(*m, "HeterClient")
+      .def(py::init(
+          [](const std::vector<std::string>& endpoint, const int& trainer_id) {
+            return HeterClient::GetInstance(endpoint, trainer_id);
+          }))
+      .def("stop", &HeterClient::Stop);
+}
+
+}  // end namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/fleet_py.h
+++ b/paddle/fluid/pybind/fleet_py.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+void BindDistFleetWrapper(py::module* m);
+void BindPSHost(py::module* m);
+void BindCommunicatorContext(py::module* m);
+void BindDistCommunicator(py::module* m);
+void BindHeterClient(py::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -103,14 +103,14 @@ limitations under the License. */
 #include "paddle/fluid/platform/xpu_info.h"
 #endif

-#ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/pybind/communicator_py.h"
-#endif
-
 #ifdef PADDLE_WITH_CRYPTO
 #include "paddle/fluid/pybind/crypto.h"
 #endif

+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/pybind/fleet_py.h"
+#endif
+
 #include "pybind11/stl.h"

 DECLARE_bool(use_mkldnn);
@@ -2837,10 +2837,13 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_CRYPTO
  BindCrypto(&m);
 #endif
+
 #ifdef PADDLE_WITH_DISTRIBUTE
-  BindCommunicator(&m);
+  BindDistFleetWrapper(&m);
+  BindPSHost(&m);
  BindCommunicatorContext(&m);
-  BindLargeScaleKV(&m);
+  BindDistCommunicator(&m);
+  BindHeterClient(&m);
 #endif
 }
 }  // namespace pybind

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -212,7 +212,7 @@ function cmake_base() {
    fi

    if [ "$SYSTEM" == "Darwin" ]; then
-        WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
+        WITH_DISTRIBUTE="OFF"
        WITH_AVX=${WITH_AVX:-ON}
        INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-~/.cache/inference_demo}
    else
@@ -220,13 +220,8 @@ function cmake_base() {
    fi

    distibuted_flag=${WITH_DISTRIBUTE:-OFF}
-    grpc_flag=${WITH_GRPC:-${distibuted_flag}}
-
-    if [ "$SYSTEM" == "Darwin" ]; then
-        gloo_flag="OFF"
-    else
+    grpc_flag="OFF"
    gloo_flag=${distibuted_flag}
-    fi

    cat <<EOF
    ========================================

--- a/python/paddle/distributed/fleet/base/runtime_factory.py
+++ b/python/paddle/distributed/fleet/base/runtime_factory.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from ..runtime.collective_runtime import CollectiveRuntime
 from ..runtime.parameter_server_runtime import ParameterServerRuntime
+from ..runtime.the_one_ps import TheOnePSRuntime


 class RuntimeFactory(object):
@@ -26,7 +27,8 @@ class RuntimeFactory(object):
            return collective_runtime

        k_steps = context["valid_strategy"].a_sync_configs["k_steps"]
+
        if not context["role_maker"]._is_collective and k_steps >= 0:
-            ps_runtime = ParameterServerRuntime()
+            ps_runtime = TheOnePSRuntime()
            ps_runtime._set_basic_info(context)
            return ps_runtime
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -72,7 +72,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):

            # for startup program
            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
-            _startup = worker.init_from_server_pass(_startup, compiled_config)
            _startup = worker.delet_extra_optimizes_pass(_startup,
                                                         compiled_config)

@@ -106,19 +105,37 @@ class ParameterServerOptimizer(MetaOptimizerBase):
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # for ps-heter mode, wait heter worker ready
-            if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
-            ):
-                wait_server_ready(self.role_maker._get_heter_worker_endpoints())
+            # if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
+            # ):
+            #     wait_server_ready(self.role_maker._get_heter_worker_endpoints())

        return _main, _startup

    def _build_pserver_programs(self, compiled_config):
-        from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server
-
        _main = fluid.Program()
        _startup = fluid.Program()

+        from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server
+
        if not compiled_config.is_geo_mode():
+
+            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
+            is_sgd_adam = False
+
+            main_program = compiled_config.get_origin_main_program()
+            ops = _get_optimize_ops(main_program)
+
+            if len(ops) == 0:
+                return _main, _startup
+
+            for op in ops:
+                if op.type in ["sgd", "adam"]:
+                    is_sgd_adam = True
+                    break
+
+            if is_sgd_adam:
+                return _main, _startup
+
            _main = server.add_listen_and_serv_pass(_main, compiled_config)
            _main = server.add_rpc_global_flags_pass(_main, compiled_config)
            _main = server.add_optimizer_pass(_main, compiled_config)
@@ -139,12 +156,8 @@ class ParameterServerOptimizer(MetaOptimizerBase):
            _main = server.add_listen_and_serv_pass(_main, compiled_config)
            _main = server.add_rpc_global_flags_pass(_main, compiled_config)
            _main = server.add_geo_optimizer_pass(_main, compiled_config)
-            _main = server.large_scale_sparse_pass(_main, _main,
-                                                   compiled_config, False)
            _startup = server.build_pserver_startup_program_pass(
                _startup, _main, compiled_config)
-            _startup = server.large_scale_sparse_pass(_startup, _main,
-                                                      compiled_config, True)
            _startup = server.delete_unused_in_startup_pass(_startup, _main,
                                                            compiled_config)


--- a/python/paddle/distributed/fleet/metrics/metric.py
+++ b/python/paddle/distributed/fleet/metrics/metric.py
@@ -17,10 +17,10 @@ import paddle.fluid as fluid
 import math
 import numpy as np
 from paddle.fluid.framework import Variable
-from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+import paddle.distributed.fleet as fleet


-def sum(input, scope=None):
+def sum(input, scope=None, util=None):
    """
    distributed sum in fleet

@@ -45,21 +45,22 @@ def sum(input, scope=None):
          res = np.array(scope.find_var(global_cnt.name).get_tensor())
          print("sum array: ", paddle.distributed.fleet.sum(res))
    """
-    fleet._role_maker._barrier_worker()
    if scope is None:
        scope = fluid.global_scope()
+    if util is None:
+        util = fleet.util
    if isinstance(input, Variable):
        input = np.array(scope.find_var(input.name).get_tensor())
    elif isinstance(input, str):
        input = np.array(scope.find_var(input).get_tensor())
    old_shape = np.array(input.shape)
    output = np.copy(input) * 0
-    fleet._role_maker._all_reduce(input, output, mode="sum")
+    output = util.all_reduce(input, "sum")
    output = output.reshape(old_shape)
    return output


-def max(input, scope=None):
+def max(input, scope=None, util=None):
    """
    distributed max in fleet

@@ -84,21 +85,22 @@ def max(input, scope=None):
          res = np.array(scope.find_var(global_cnt.name).get_tensor())
          print("max array: ", paddle.distributed.fleet.max(res))
    """
-    fleet._role_maker._barrier_worker()
    if scope is None:
        scope = fluid.global_scope()
+    if util is None:
+        util = fleet.util
    if isinstance(input, Variable):
        input = np.array(scope.find_var(input.name).get_tensor())
    elif isinstance(input, str):
        input = np.array(scope.find_var(input).get_tensor())
    old_shape = np.array(input.shape)
    output = np.copy(input) * 0
-    fleet._role_maker._all_reduce(input, output, mode="max")
+    output = util.all_reduce(input, "max")
    output = output.reshape(old_shape)
    return output


-def min(input, scope=None):
+def min(input, scope=None, util=None):
    """
    distributed min in fleet

@@ -123,21 +125,22 @@ def min(input, scope=None):
          res = np.array(scope.find_var(global_cnt.name).get_tensor())
          print("min array: ", paddle.distributed.fleet.min(res))
    """
-    fleet._role_maker._barrier_worker()
    if scope is None:
        scope = fluid.global_scope()
+    if util is None:
+        util = fleet.util
    if isinstance(input, Variable):
        input = np.array(scope.find_var(input.name).get_tensor())
    elif isinstance(input, str):
        input = np.array(scope.find_var(input).get_tensor())
    old_shape = np.array(input.shape)
    output = np.copy(input) * 0
-    fleet._role_maker._all_reduce(input, output, mode="min")
+    output = util.all_reduce(input, "min")
    output = output.reshape(old_shape)
    return output


-def auc(stat_pos, stat_neg, scope=None):
+def auc(stat_pos, stat_neg, scope=None, util=None):
    """
    distributed auc in fleet

@@ -164,9 +167,11 @@ def auc(stat_pos, stat_neg, scope=None):
          neg = np.array(scope.find_var(stat_neg.name).get_tensor())
          print("auc: ", paddle.distributed.fleet.auc(pos, neg))
    """
-    fleet._role_maker._barrier_worker()
    if scope is None:
        scope = fluid.global_scope()
+    if util is None:
+        util = fleet.util
+
    if isinstance(stat_pos, Variable):
        stat_pos = np.array(scope.find_var(stat_pos.name).get_tensor())
    elif isinstance(stat_pos, str):
@@ -181,15 +186,14 @@ def auc(stat_pos, stat_neg, scope=None):
    stat_pos = stat_pos.reshape(-1)
    global_pos = np.copy(stat_pos) * 0
    # mpi allreduce
-    fleet._role_maker._all_reduce(stat_pos, global_pos)
-    # reshape to its original shape
+    global_pos = util.all_reduce(stat_pos, "sum")
    global_pos = global_pos.reshape(old_pos_shape)

    # auc neg bucket
    old_neg_shape = np.array(stat_neg.shape)
    stat_neg = stat_neg.reshape(-1)
    global_neg = np.copy(stat_neg) * 0
-    fleet._role_maker._all_reduce(stat_neg, global_neg)
+    global_neg = util.all_reduce(stat_neg, "sum")
    global_neg = global_neg.reshape(old_neg_shape)

    # calculate auc
@@ -216,11 +220,10 @@ def auc(stat_pos, stat_neg, scope=None):
    else:
        auc_value = area / (pos * neg)

-    fleet._role_maker._barrier_worker()
    return auc_value


-def mae(abserr, total_ins_num, scope=None):
+def mae(abserr, total_ins_num, scope=None, util=None):
    """
    distributed mae in fleet

@@ -242,23 +245,28 @@ def mae(abserr, total_ins_num, scope=None):
          res = np.array(scope.find_var(abserr.name).get_tensor())
          print("mae: ", paddle.distributed.fleet.mae(res, total_ins_num))
    """
-    fleet._role_maker._barrier_worker()
    if scope is None:
        scope = fluid.global_scope()
+    if util is None:
+        util = fleet.util
+
    if isinstance(abserr, Variable):
        abserr = np.array(scope.find_var(abserr.name).get_tensor())
    elif isinstance(abserr, str):
        abserr = np.array(scope.find_var(abserr).get_tensor())
+
    old_metric_shape = np.array(abserr.shape)
    abserr = abserr.reshape(-1)
    global_metric = np.copy(abserr) * 0
-    fleet._role_maker._all_reduce(abserr, global_metric)
+
+    global_metric = util.all_reduce(abserr, "sum")
    global_metric = global_metric.reshape(old_metric_shape)
+
    mae_value = global_metric[0] / total_ins_num
    return mae_value


-def rmse(sqrerr, total_ins_num, scope=None):
+def rmse(sqrerr, total_ins_num, scope=None, util=None):
    """
    distributed rmse in fleet

@@ -280,9 +288,11 @@ def rmse(sqrerr, total_ins_num, scope=None):
          res = np.array(scope.find_var(sqrerr.name).get_tensor())
          print("rmse: ", paddle.distributed.fleet.rmse(res, total_ins_num))
    """
-    fleet._role_maker._barrier_worker()
    if scope is None:
        scope = fluid.global_scope()
+    if util is None:
+        util = fleet.util
+
    if isinstance(sqrerr, Variable):
        sqrerr = np.array(scope.find_var(sqrerr.name).get_tensor())
    elif isinstance(sqrerr, str):
@@ -290,13 +300,15 @@ def rmse(sqrerr, total_ins_num, scope=None):
    old_metric_shape = np.array(sqrerr.shape)
    sqrerr = sqrerr.reshape(-1)
    global_metric = np.copy(sqrerr) * 0
-    fleet._role_maker._all_reduce(sqrerr, global_metric)
+
+    global_metric = util.all_reduce(sqrerr, "sum")
    global_metric = global_metric.reshape(old_metric_shape)
+
    rmse_value = math.sqrt(global_metric[0] / total_ins_num)
    return rmse_value


-def mse(sqrerr, total_ins_num, scope=None):
+def mse(sqrerr, total_ins_num, scope=None, util=None):
    """
    distributed mse in fleet

@@ -318,9 +330,11 @@ def mse(sqrerr, total_ins_num, scope=None):
          metric = np.array(scope.find_var(sqrerr.name).get_tensor())
          print("mse: ", paddle.distributed.fleet.mse(metric, total_ins_num))
    """
-    fleet._role_maker._barrier_worker()
    if scope is None:
        scope = fluid.global_scope()
+    if util is None:
+        util = fleet.util
+
    if isinstance(sqrerr, Variable):
        sqrerr = np.array(scope.find_var(sqrerr.name).get_tensor())
    elif isinstance(sqrerr, str):
@@ -328,13 +342,15 @@ def mse(sqrerr, total_ins_num, scope=None):
    old_metric_shape = np.array(sqrerr.shape)
    sqrerr = sqrerr.reshape(-1)
    global_metric = np.copy(sqrerr) * 0
-    fleet._role_maker._all_reduce(sqrerr, global_metric)
+
+    global_metric = util.all_reduce(sqrerr, "sum")
    global_metric = global_metric.reshape(old_metric_shape)
+
    mse_value = global_metric[0] / total_ins_num
    return mse_value


-def acc(correct, total, scope=None):
+def acc(correct, total, scope=None, util=None):
    """
    distributed accuracy in fleet

@@ -367,9 +383,11 @@ def acc(correct, total, scope=None):
          total_num = np.array(scope.find_var(total.name).get_tensor())
          print("accuracy: ", paddle.distributed.fleet.acc(correct_num, total_num))
    """
-    fleet._role_maker._barrier_worker()
    if scope is None:
        scope = fluid.global_scope()
+    if util is None:
+        util = fleet.util
+
    if isinstance(correct, Variable):
        correct = np.array(scope.find_var(correct.name).get_tensor())
    elif isinstance(correct, str):
@@ -378,8 +396,11 @@ def acc(correct, total, scope=None):
        total = np.array(scope.find_var(total.name).get_tensor())
    elif isinstance(total, str):
        total = np.array(scope.find_var(total).get_tensor())
+
    global_correct_num = np.copy(correct) * 0
    global_total_num = np.copy(total) * 0
-    fleet._role_maker._all_reduce(correct, global_correct_num)
-    fleet._role_maker._all_reduce(total, global_total_num)
+
+    global_correct_num = util.all_reduce(correct, "sum")
+    global_total_num = util.all_reduce(total, "sum")
+
    return float(global_correct_num[0]) / float(global_total_num[0])
--- a/python/paddle/distributed/fleet/runtime/__init__.py
+++ b/python/paddle/distributed/fleet/runtime/__init__.py
@@ -14,3 +14,4 @@

 from .collective_runtime import CollectiveRuntime
 from .parameter_server_runtime import ParameterServerRuntime
+from .the_one_ps import TheOnePSRuntime
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
--- a/python/paddle/distributed/fleet/utils/__init__.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
@@ -13,3 +13,4 @@
 # limitations under the License.

 from .fs import LocalFS, HDFSClient
+from .ps_util import Distributed
--- a/python/paddle/distributed/fleet/utils/ps_util.py
+++ b/python/paddle/distributed/fleet/utils/ps_util.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -216,25 +216,6 @@ def __bootstrap__():
        read_env_flags.append('tracer_mkldnn_ops_on')
        read_env_flags.append('tracer_mkldnn_ops_off')

-    if core.is_compiled_with_dist():
-        #env for rpc
-        read_env_flags.append('rpc_deadline')
-        read_env_flags.append('rpc_retry_times')
-        read_env_flags.append('rpc_server_profile_path')
-        read_env_flags.append('enable_rpc_profiler')
-        read_env_flags.append('rpc_send_thread_num')
-        read_env_flags.append('rpc_get_thread_num')
-        read_env_flags.append('rpc_prefetch_thread_num')
-        read_env_flags.append('rpc_disable_reuse_port')
-        read_env_flags.append('rpc_retry_bind_port')
-
-        read_env_flags.append('worker_update_interval_secs')
-
-        if core.is_compiled_with_brpc():
-            read_env_flags.append('max_body_size')
-            #set brpc max body size
-            os.environ['FLAGS_max_body_size'] = "2147483647"
-
    if core.is_compiled_with_cuda():
        read_env_flags += [
            'fraction_of_gpu_memory_to_use',

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
--- a/python/paddle/fluid/communicator.py
+++ b/python/paddle/fluid/communicator.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1365,7 +1365,8 @@ class Variable(object):
        if self.type == core.VarDesc.VarType.SELECTED_ROWS or self.type == core.VarDesc.VarType.LOD_TENSOR:
            dtype_str = str(self.dtype).split('.')[1]
            var_str = "{name} : {type}.shape{shape}.dtype({dtype}).stop_gradient({stop_gradient})".\
-                format(name=self.name, type=type_str, shape=self.shape, dtype=dtype_str, stop_gradient=self.stop_gradient)
+                format(name=self.name, type=type_str, shape=self.shape,
+                       dtype=dtype_str, stop_gradient=self.stop_gradient)
        else:
            var_str = "{name} : {type})".\
                format(name=self.name, type=type_str)
@@ -2013,7 +2014,8 @@ class Operator(object):
        'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
        'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
        'gen_nccl_id', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_calc_stream',
-        'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue'
+        'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue',
+        'heter_listen_and_serv'
    }

    def __init__(self,
@@ -2284,7 +2286,8 @@ class Operator(object):

        if outputs_str != "{}":
            op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\
-                format(outputs = outputs_str, op_type=self.type, inputs=inputs_str, attrs=attrs_str)
+                format(outputs=outputs_str, op_type=self.type,
+                       inputs=inputs_str, attrs=attrs_str)
        else:
            op_str = "{op_type}(inputs={inputs}, {attrs})".\
                format(op_type=self.type, inputs=inputs_str, attrs=attrs_str)
@@ -3967,8 +3970,9 @@ class IrGraph(object):

        def _convert_to_pdf(dot_file_path):
            pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
-            exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
-                                          + ' -o ' + pdf_save_path, shell=True)
+            exited_code = subprocess.call(
+                'dot -Tpdf ' + dot_file_path + ' -o ' + pdf_save_path,
+                shell=True)
            if exited_code != 0:
                print('The dot command is needed for creating pdf files.')
                print('The {} is saved as the dot filetype.'.format(
@@ -4581,7 +4585,7 @@ class Program(object):
            The two code snippets above will generate and print same programs.
        """

-        #NOTE(zhiqiu): we sync the original program first, since its program may diff with
+        # NOTE(zhiqiu): we sync the original program first, since its program may diff with
        # its desc due to modifying desc in c++ space. E.g. save op will add kLookupTablePath in desc.
        self._sync_with_cpp()

@@ -4611,7 +4615,7 @@ class Program(object):
            if hasattr(self, 'lr_sheduler'):
                p.lr_sheduler = self.lr_sheduler

-            #NOTE(zhiqiu): we sync the cloned program, to update its program by
+            # NOTE(zhiqiu): we sync the cloned program, to update its program by
            # its desc.
            p._sync_with_cpp()

@@ -4656,7 +4660,7 @@ class Program(object):
            Program:  A new, pruned program.
        """

-        #NOTE(zhiqiu): we sync the original program first, since its program may diff with
+        # NOTE(zhiqiu): we sync the original program first, since its program may diff with
        # its desc due to modifying desc in c++ space. E.g. save op will add kLookupTablePath in desc.
        self._sync_with_cpp()


--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -20,6 +20,9 @@ set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES  ${TARGET_LIBRARIES}
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

+# for coverage
+LIST(REMOVE_ITEM TEST_OPS test_custom_op)
+
 foreach(src ${TEST_OPS})
    py_test(${src} SRCS ${src}.py)
 endforeach()
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
--- a/python/paddle/fluid/tests/unittests/test_communicator_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
--- a/python/paddle/fluid/tests/unittests/test_desc_clone_dist.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone_dist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
--- a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
--- a/python/paddle/fluid/tests/unittests/test_ref_by_trainer_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ref_by_trainer_id_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py
--- a/python/paddle/fluid/tests/unittests/test_fleet_metric.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_metric.py
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_split_op.py
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
--- a/python/paddle/fluid/tests/unittests/test_program_code_dist.py
+++ b/python/paddle/fluid/tests/unittests/test_program_code_dist.py
--- a/python/paddle/fluid/tests/unittests/test_recv_save_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recv_save_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py