未验证 提交 032414ca 编写于 作者: T tangwei12 提交者: GitHub

[Feature] one ps (3/4) (#29604)

* oneps (3/4)
Co-authored-by: MrChengmo <cmchengmo@163.com>
Co-authored-by: malin10 <malin10@baidu.com>
Co-authored-by: chengmo <chengmo@baidu.com>
上级 edc06c6a
......@@ -246,17 +246,6 @@ endif()
include(third_party) # download, build, install third_party, Contains about 20+ dependencies
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
message(STATUS "Use grpc framework.")
include(external/grpc)
else()
message(STATUS "Use brpc framework.")
include(external/leveldb)
include(external/brpc)
endif()
endif()
include(flags) # set paddle compile flags
if(WITH_PROFILER)
......
......@@ -33,15 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add(
extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY "${GIT_URL}/apache/incubator-brpc.git"
GIT_TAG "ad00fe940b4f05225b214131959293bbed8744a0" #rdma branch's head now.
# TODO(gongwb): change to de newst repo when they changed.
GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47"
PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......@@ -63,9 +63,13 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest)
# ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog snappy)
ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc)
add_definitions(-DBRPC_WITH_GLOG)
LIST(APPEND external_project_dependencies brpc)
......@@ -23,10 +23,10 @@ INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
ExternalProject_Add(
extern_leveldb
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
PREFIX ${LEVELDB_SOURCES_DIR}
GIT_REPOSITORY "${GIT_URL}/google/leveldb.git"
GIT_REPOSITORY "https://github.com/google/leveldb"
GIT_TAG v1.18
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/
......@@ -35,6 +35,11 @@ ExternalProject_Add(
BUILD_IN_SOURCE 1
)
ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb)
LIST(APPEND external_project_dependencies leveldb)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)

# Snappy compression library, required by leveldb / brpc when the
# distributed parameter-server build is enabled.
set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)

# On MSVC, silence the conversion warnings (C4244/C4267) emitted by the
# upstream snappy sources; elsewhere pass the project's CXX flags through.
if(WIN32)
    set(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
    set(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()

# Download and build snappy 1.1.7 as an external project. CMAKE_CACHE_ARGS
# repeats the install/PIC/build-type settings so they survive a cached
# re-configure of the sub-build.
ExternalProject_Add(
    extern_snappy
    GIT_REPOSITORY  "https://github.com/google/snappy"
    GIT_TAG         "1.1.7"
    PREFIX          ${SNAPPY_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
                    -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                    -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DBUILD_TESTING=OFF
                    -DSNAPPY_BUILD_TESTS:BOOL=OFF
                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
                     -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)

# MSVC names the archive snappy.lib; copy it to libsnappy.lib so downstream
# link lines can refer to a single name on every platform.
if(WIN32)
    if(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
        add_custom_command(TARGET extern_snappy POST_BUILD
            COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib
        )
    endif()
    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
else()
    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif()

# Expose the built archive to the rest of the build as the imported
# target `snappy`.
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
include_directories(${SNAPPY_INCLUDE_DIR})
add_dependencies(snappy extern_snappy)
......@@ -95,7 +95,7 @@ include_directories("${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io")
if(NOT APPLE)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
if(WITH_PSLIB)
if(WITH_PSLIB OR WITH_DISTRIBUTE)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl")
else()
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
......
......@@ -233,7 +233,7 @@ if(WITH_PYTHON)
list(APPEND third_party_deps extern_pybind)
endif()
IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
IF(WITH_TESTING OR WITH_DISTRIBUTE)
include(external/gtest) # download, build, install gtest
list(APPEND third_party_deps extern_gtest)
ENDIF()
......@@ -275,14 +275,18 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_DISTRIBUTE)
if (WITH_DISTRIBUTE)
include(external/snappy)
list(APPEND third_party_deps extern_snappy)
if(WITH_GRPC)
list(APPEND third_party_deps extern_grpc)
else()
include(external/leveldb)
list(APPEND third_party_deps extern_leveldb)
include(external/brpc)
list(APPEND third_party_deps extern_brpc)
endif()
include(external/libmct) # download, build, install libmct
list(APPEND third_party_deps extern_libmct)
endif()
if(WITH_XBYAK)
......
......@@ -14,14 +14,9 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
"${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
endif()
add_subdirectory(table)
add_subdirectory(test)
# open it until CI support brpc
return()
add_subdirectory(service)
add_subdirectory(test)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
......
......@@ -35,6 +35,6 @@ cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS})
cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS})
cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RPC_DEPS})
cc_library(brpc_utils SRCS brpc_utils.cc DEPS ${COMMON_DEPS} ${RPC_DEPS})
cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS})
cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
......@@ -741,7 +741,7 @@ std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
request_call_num, [shard_sorted_kvs, value_size](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < ids.size(); ++i) {
for (size_t i = 0; i < shard_sorted_kvs->size(); ++i) {
if (closure->check_response(i, PS_PULL_SPARSE_TABLE) != 0) {
ret = -1;
break;
......
......@@ -839,7 +839,7 @@ void GeoCommunicator::InitParams(const RecvCtxMap &recv_varname_to_ctx) {
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
if (!ctx.is_sparse) return;
if (!ctx.is_sparse) continue;
auto &varname = ctx.origin_varnames[0];
auto &table_id = ctx.table_id;
auto param = varname.substr(0, varname.size() - 5);
......@@ -853,12 +853,12 @@ void GeoCommunicator::InitDense(std::vector<std::string> &varnames,
if (trainer_id_ == 0) {
RpcSendDenseParam(varnames, table_id, *recv_scope_);
BarrierWithTable(1);
VLOG(0) << "push dense param to table " << table_id
VLOG(1) << "push dense param to table " << table_id
<< " from 0' trainer done";
} else {
BarrierWithTable(1);
RpcRecvDense(varnames, table_id, recv_scope_);
VLOG(0) << "push dense param to table " << table_id
VLOG(1) << "pull dense param to table " << table_id
<< " from 0' trainer done";
}
......@@ -952,20 +952,20 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
}
void GeoCommunicator::InitSparse(const std::string &var_name, int table_id) {
VLOG(0) << "Init Sparse " << var_name << " : table " << table_id << " begin.";
VLOG(1) << "Init Sparse " << var_name << " : table " << table_id << " begin.";
if (trainer_id_ == 0) {
RpcSendSparseParam(var_name, table_id, *recv_scope_);
BarrierWithTable(1);
VLOG(0) << "push sparse param to table " << table_id
VLOG(1) << "push sparse param to table " << table_id
<< " from 0' trainer done";
} else {
BarrierWithTable(1);
RpcRecvSparse(var_name, table_id, recv_scope_);
VLOG(0) << "push dense param to table " << table_id
VLOG(1) << "pull sparse param to table " << table_id
<< " from 0' trainer done";
}
VLOG(0) << "Init Sparse " << var_name << " : table " << table_id << " done.";
VLOG(1) << "Init Sparse " << var_name << " : table " << table_id << " done.";
auto *global_var = recv_scope_->FindVar(var_name);
auto *var = old_scope_->Var(var_name);
framework::CopyVariable(*global_var, var);
......
......@@ -24,11 +24,11 @@
#include "paddle/fluid/platform/timer.h"
DECLARE_int32(rpc_deadline);
DECLARE_int32(pserver_timeout_ms);
namespace paddle {
namespace distributed {
DEFINE_int32(pserver_timeout_ms, 10800000, "pserver request server timeout_ms");
std::shared_ptr<HeterClient> HeterClient::s_instance_ = NULL;
bool HeterClient::is_initialized_ = false;
......@@ -53,6 +53,23 @@ void HeterClient::Stop() {
}
}
// Shuts the client-side worker down: stops the background loop and joins
// the main thread. Safe to call when the client was never initialized.
void HeterClient::FinalizeWorker() {
  // Signal the background loop to exit before any cleanup happens.
  running_ = false;
  if (!is_initialized_) {
    VLOG(0) << "HeterClient is not inited, do nothing";
    return;
  }
  if (main_thread_) {
    main_thread_->join();
    main_thread_.reset(nullptr);
  }
  VLOG(1) << "HeterClient Stop Done";
}
// Asks the remote heter workers to stop by broadcasting the PS_STOP_SERVER
// command with no parameters; the returned future resolves with the rpc
// status. NOTE(review): the first argument (-1) presumably means "no
// specific table" — confirm against SendCmd's signature.
std::future<int32_t> HeterClient::StopHeterWorker() {
return SendCmd(-1, PS_STOP_SERVER, {});
}
void HeterClient::RpcProfilerControl() {
if (trainer_id_ == 0) {
if (!do_server_profiler_ && platform::IsProfileEnabled()) {
......@@ -73,7 +90,7 @@ void HeterClient::CreateClient2XpuConnection() {
brpc::ChannelOptions options;
options.protocol = "baidu_std";
options.connection_type = "single";
options.timeout_ms = pserver_timeout_ms;
options.timeout_ms = FLAGS_pserver_timeout_ms;
xpu_channels_.resize(xpu_list_.size());
for (size_t i = 0; i < xpu_list_.size(); ++i) {
......@@ -102,7 +119,7 @@ void HeterClient::SendAndRecvAsync(
int num = trainer_id_ % xpu_channels_.size();
brpc::Controller cntl;
cntl.set_timeout_ms(pserver_timeout_ms);
cntl.set_timeout_ms(FLAGS_pserver_timeout_ms);
distributed::MultiVarMsg request, response;
auto& request_io_buffer = cntl.request_attachment();
::paddle::PsService_Stub stub(xpu_channels_[num].get());
......@@ -149,7 +166,7 @@ std::future<int32_t> HeterClient::SendCmd(
}
::paddle::PsService_Stub rpc_stub(xpu_channels_[i].get());
closure->cntl(i)->set_timeout_ms(
pserver_timeout_ms); // cmd msg don't limit timeout for save/load
FLAGS_pserver_timeout_ms); // cmd msg don't limit timeout for save/load
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
......
......@@ -42,7 +42,7 @@ typedef std::function<void(void*)> HeterRpcCallbackFunc;
class OnHeterRpcDone : public google::protobuf::Closure {
public:
OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
explicit OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
virtual ~OnHeterRpcDone() {}
void Run() {
std::unique_ptr<OnHeterRpcDone> self_guard(this);
......@@ -79,7 +79,6 @@ class HeterClient {
if (NULL == s_instance_) {
is_initialized_ = true;
s_instance_.reset(new paddle::distributed::HeterClient());
std::vector<std::string> xpu_list = {endpoint};
s_instance_->SetXpuList(endpoint);
s_instance_->SetTrainerID(trainer_id);
s_instance_->CreateClient2XpuConnection();
......@@ -89,6 +88,8 @@ class HeterClient {
void Stop();
void FinalizeWorker();
void MainThread();
void RpcProfilerControl();
......@@ -97,6 +98,7 @@ class HeterClient {
const std::vector<std::string>& params);
std::future<int32_t> StartProfiler();
std::future<int32_t> StopProfiler();
std::future<int32_t> StopHeterWorker();
......@@ -104,17 +106,16 @@ class HeterClient {
void SetXpuList(const std::vector<std::string>& xpu_list) {
xpu_list_ = xpu_list;
};
}
void SetTrainerID(const int& trainer_id) { trainer_id_ = trainer_id; }
private:
static std::shared_ptr<HeterClient> s_instance_;
protected:
static bool is_initialized_;
std::unique_ptr<std::thread> main_thread_{nullptr};
std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
DISABLE_COPY_AND_ASSIGN(HeterClient);
std::vector<std::string> xpu_list_;
......
......@@ -45,7 +45,11 @@ void HeterServer::StartHeterService() {
}
condition_ready_.notify_all();
server_.Join();
std::unique_lock<std::mutex> running_lock(mutex_);
cv_.wait(running_lock, [&] {
VLOG(1) << "Heter Server is Stop? " << stoped_;
return stoped_;
});
}
void HeterServer::SetEndPoint(std::string& endpoint) {
......@@ -83,6 +87,7 @@ int32_t HeterService::stop_heter_worker(const PsRequestMessage& request,
stop_cpu_worker_set_.insert(client_id);
if (stop_cpu_worker_set_.size() == fan_in_) {
is_exit_ = true;
VLOG(0) << "Stop heter Service done.";
}
return 0;
}
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <random>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "brpc/channel.h"
#include "brpc/controller.h"
......@@ -34,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
#include "paddle/fluid/platform/profiler.h"
DECLARE_double(eager_delete_tensor_gb);
namespace paddle {
namespace distributed {
......@@ -82,7 +84,7 @@ class HeterService : public ::paddle::PsService {
response->set_err_code(service_ret);
response->set_err_msg("server internal error");
}
};
}
void SendAndRecvVariable(::google::protobuf::RpcController* controller,
const MultiVarMsg* request, MultiVarMsg* response,
......@@ -134,6 +136,10 @@ class HeterServer {
virtual ~HeterServer() {}
// Requests server shutdown: marks stoped_ under the mutex and wakes the
// thread blocked in StartHeterService() (which waits on cv_ for stoped_),
// then stops and joins the brpc server.
void Stop() {
VLOG(0) << "HeterServer Stop()";
std::unique_lock<std::mutex> lock(mutex_);
stoped_ = true;
cv_.notify_all();
// NOTE(review): server_.Stop/Join execute while mutex_ is still held; the
// woken waiter can only re-acquire the lock after this scope exits —
// confirm this ordering is intended.
server_.Stop(1000);
server_.Join();
}
......@@ -162,6 +168,10 @@ class HeterServer {
private:
static std::shared_ptr<HeterServer> s_instance_;
mutable std::mutex mutex_;
std::condition_variable cv_;
std::condition_variable condition_ready_;
bool stoped_ = false;
std::string endpoint_;
protected:
......@@ -169,7 +179,7 @@ class HeterServer {
HeterService service_;
DISABLE_COPY_AND_ASSIGN(HeterServer);
std::mutex mutex_ready_;
std::condition_variable condition_ready_;
int ready_;
};
......@@ -215,6 +225,7 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler {
int Handle(const MultiVarMsg* request, MultiVarMsg* response,
brpc::Controller* cntl) override {
platform::RecordEvent record_event("RequestSendAndRecvHandler->Handle");
FLAGS_eager_delete_tensor_gb = -1;
auto& local_scope = scope_->NewScope();
auto message_name = request->message_name();
auto& request_io_buffer = cntl->request_attachment();
......
......@@ -60,6 +60,8 @@ int32_t PSServer::configure(const PSParameter &config, PSEnvironment &env,
_environment = &env;
_shuffled_ins =
paddle::framework::MakeChannel<std::pair<uint64_t, std::string>>();
size_t shard_num = env.get_ps_servers().size();
const auto &downpour_param = _config.downpour_server_param();
uint32_t barrier_table = UINT32_MAX;
......@@ -72,6 +74,7 @@ int32_t PSServer::configure(const PSParameter &config, PSEnvironment &env,
"BarrierTable") {
barrier_table = downpour_param.downpour_table_param(i).table_id();
}
table->set_shard(_rank, shard_num);
table->initialize(downpour_param.downpour_table_param(i),
config.fs_client_param());
_table_map[downpour_param.downpour_table_param(i).table_id()].reset(table);
......
......@@ -12,8 +12,7 @@ cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse
set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_framework_proto device_context)
cc_library(tensor_table SRCS tensor_table.cc DEPS ps_framework_proto proto_desc enforce executor tensor device_context simple_threadpool gflags glog )
set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(table SRCS table.cc DEPS common_table tensor_table tensor_accessor ps_framework_proto string_helper device_context gflags glog boost)
cc_library(table SRCS table.cc DEPS common_table tensor_accessor ps_framework_proto string_helper device_context gflags glog boost)
......@@ -251,6 +251,30 @@ int32_t CommonSparseTable::initialize_value() {
auto shard = std::make_shared<ValueBlock>(common, &initializers_);
shard_values_.emplace_back(shard);
}
auto accessor = _config.accessor();
std::vector<uint64_t> feasigns;
for (size_t x = 0; x < accessor.fea_dim(); ++x) {
if (x % _shard_num == _shard_idx) {
feasigns.push_back(x);
}
}
VLOG(0) << "has " << feasigns.size() << " ids need to be pre inited";
auto buckets = bucket(feasigns.size(), 10);
for (int x = 0; x < 10; ++x) {
auto bucket_feasigns = buckets[x + 1] - buckets[x];
std::vector<uint64_t> ids(bucket_feasigns);
std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1],
ids.begin());
std::vector<float> pulls;
pulls.resize(bucket_feasigns * param_dim_);
pull_sparse(pulls.data(), ids.data(), bucket_feasigns);
}
return 0;
}
......
......@@ -34,6 +34,18 @@ class Initializer {
virtual float GetValue() = 0;
// Appends numel freshly drawn values to *values. NOTE(review): the vector
// is not cleared first — callers appear to rely on append semantics.
virtual void GetValue(std::vector<float> *values, int numel) {
for (int x = 0; x < numel; ++x) {
values->push_back(GetValue());
}
}
// Fills value[0..numel) with freshly drawn values; the caller owns the
// buffer and must guarantee it holds at least numel floats.
virtual void GetValue(float *value, int numel) {
for (int x = 0; x < numel; ++x) {
value[x] = GetValue();
}
}
virtual ~Initializer() {}
protected:
......@@ -54,6 +66,11 @@ class UniformInitializer : public Initializer {
}
float GetValue() override { return dist_(*random_engine_); }
void GetValue(float *value, int numel) {
for (int x = 0; x < numel; ++x) {
value[x] = dist_(*random_engine_);
}
}
private:
float min_;
......@@ -77,6 +94,11 @@ class GaussianInitializer : public Initializer {
}
float GetValue() override { return dist_(*random_engine_); }
void GetValue(float *value, int numel) {
for (int x = 0; x < numel; ++x) {
value[x] = dist_(*random_engine_);
}
}
private:
float std_;
......@@ -94,6 +116,7 @@ class FillConstantInitializer : public Initializer {
}
float GetValue() override { return value_; }
void GetValue(float *value, int numel) { std::fill_n(value, numel, value_); }
private:
float value_;
......
......@@ -68,7 +68,7 @@ inline bool entry<float>(const int count, const float threshold) {
struct VALUE {
explicit VALUE(const std::vector<std::string> &names)
: names_(names), count_(0), unseen_days_(0) {
: names_(names), count_(1), unseen_days_(0), seen_after_last_save_(true) {
values_.resize(names.size());
for (int i = 0; i < static_cast<int>(names.size()); i++) {
places[names[i]] = i;
......@@ -79,6 +79,14 @@ struct VALUE {
values_ = std::move(*values);
}
void set(const std::vector<Initializer *> &inits, std::vector<int> numels) {
for (int x = 0; x < numels.size(); ++x) {
auto &value = values_[x];
value.resize(numels[x]);
inits[x]->GetValue(value.data(), numels[x]);
}
}
void set(const std::vector<std::string> &names,
const std::vector<std::vector<float>> &values) {
for (int i = 0; i < static_cast<int>(names.size()); i++) {
......@@ -117,8 +125,8 @@ struct VALUE {
std::vector<std::string> names_;
int count_;
bool seen_after_last_save_;
int unseen_days_;
bool seen_after_last_save_;
bool is_entry_;
std::vector<std::vector<float>> values_;
std::unordered_map<std::string, int> places;
......@@ -139,15 +147,20 @@ class ValueBlock {
value_dims_.push_back(dim);
}
for (auto &name : value_names_) {
initializer_list_.emplace_back(initializers_->at(name));
}
// for Entry
{
// entry will add later
std::string entry_attr = "none";
if (entry_attr == "none") {
has_entry = false;
entry_func_ =
std::bind(entry<std::string>, std::placeholders::_1, "none");
} else {
has_entry = true;
auto slices = string::split_string<std::string>(entry_attr, "&");
if (slices[0] == "count_filter") {
int threshold = std::stoi(slices[1]);
......@@ -181,6 +194,22 @@ class ValueBlock {
values_[id] = value;
}
void Init(const uint64_t &id, const std::vector<Initializer *> &inits,
int count) {
if (Has(id)) {
PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error"));
}
if (inits.size() != value_names_.size()) {
PADDLE_THROW(
platform::errors::AlreadyExists("values can not match, error"));
}
auto value = new VALUE(value_names_);
value->set(inits, value_dims_);
values_[id] = value;
}
std::vector<std::vector<float> *> Get(
const uint64_t &id, const std::vector<std::string> &value_names) {
auto ret_values = values_.at(id)->get(value_names);
......@@ -195,27 +224,12 @@ class ValueBlock {
void InitFromInitializer(const uint64_t &id,
const std::vector<std::string> &value_names) {
if (Has(id)) {
if (has_entry) {
Update(id);
return;
}
auto rets = std::vector<std::vector<float>>();
rets.resize(value_names_.size());
for (int i = 0; i < static_cast<int>(value_names_.size()); i++) {
auto name = value_names_[i];
auto *init = initializers_->at(name);
auto dim = value_dims_[i];
rets[i].resize(dim);
for (int j = 0; j < static_cast<int>(dim); j++) {
rets[i][j] = init->GetValue();
}
return;
}
Init(id, &rets, 0);
Update(id);
Init(id, initializer_list_, 1);
}
bool GetEntry(const uint64_t &id) {
......@@ -254,10 +268,12 @@ class ValueBlock {
std::unordered_map<uint64_t, VALUE *> values_;
private:
bool has_entry = false;
std::vector<std::string> value_names_;
std::vector<int> value_dims_;
std::function<bool(uint64_t)> entry_func_;
std::unordered_map<std::string, Initializer *> *initializers_;
std::vector<Initializer *> initializer_list_;
};
} // namespace distributed
......
......@@ -22,14 +22,12 @@
#include "paddle/fluid/distributed/table/common_sparse_table.h"
#include "paddle/fluid/distributed/table/sparse_geo_table.h"
#include "paddle/fluid/distributed/table/tensor_accessor.h"
#include "paddle/fluid/distributed/table/tensor_table.h"
namespace paddle {
namespace distributed {
REGISTER_CLASS(Table, CommonDenseTable);
REGISTER_CLASS(Table, CommonSparseTable);
REGISTER_CLASS(Table, DenseTensorTable);
REGISTER_CLASS(Table, SparseGeoTable);
REGISTER_CLASS(Table, BarrierTable);
......
# Unit tests for the distributed parameter-server tables. Skipped entirely
# on macOS.
if(APPLE)
return()
endif()
set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(sparse_table_test SRCS sparse_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(geo_table_test SRCS geo_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
# TODO: the brpc service test below is intentionally disabled by the early
# return(); enable it once CI supports brpc.
return()
set_source_files_properties(brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(brpc_service_dense_sgd_test SRCS brpc_service_dense_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
......
......@@ -120,7 +120,7 @@ TEST(CommonDenseTable, Adam) {
beta2_pow[0] *= beta2;
}
for (int j = 0; j < fea_dim; j++) {
ASSERT_TRUE(abs(param[j] - pull_values[j]) < 1e-6);
ASSERT_TRUE(abs(param[j] - pull_values[j]) < 1e-5);
}
}
......
......@@ -62,7 +62,7 @@ TEST(SparseGeoTable, SSUM) {
std::vector<float> pull_values(init_values.size());
table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size());
for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-6);
ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5);
}
std::vector<std::vector<uint64_t>> trainer_keys;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ThreadPool.h>
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/table/common_sparse_table.h"
#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
#include "paddle/fluid/distributed/table/table.h"
namespace paddle {
namespace distributed {
// Configures a CommonSparseTable with an Adam-style slot layout
// (Param / LearningRate / Moment1 / Moment2 / Beta1Pow / Beta2Pow) and
// checks that initialize() accepts the configuration.
// Fix: removed the unused locals beta1, beta2 and epsilon (dead code that
// triggers -Wunused-variable); the Adam hyper-parameters here are carried
// by the initializer strings, not by those locals.
TEST(BENCHMARK, LargeScaleKV) {
int emb_dim = 10;
int trainers = 2;

TableParameter table_config;
table_config.set_table_class("CommonSparseTable");
FsClientParameter fs_config;
// NOTE(review): intentionally not freed; the test process exits right after.
Table *table = new CommonSparseTable();
TableAccessorParameter *accessor_config = table_config.mutable_accessor();
accessor_config->set_accessor_class("CommMergeAccessor");
CommonAccessorParameter *common_config = table_config.mutable_common();
common_config->set_name("adam");
common_config->set_table_name("adam_test_table");
common_config->set_trainer_num(trainers);

// One (param name, dim, initializer spec) triple per optimizer slot; the
// initializer strings are "<type>&<arg>&..." as parsed by the table.
common_config->add_params("Param");
common_config->add_dims(emb_dim);
common_config->add_initializers("uniform_random&0&-1.0&1.0");
common_config->add_params("LearningRate");
common_config->add_dims(1);
common_config->add_initializers("fill_constant&1.0");
common_config->add_params("Moment1");
common_config->add_dims(emb_dim);
common_config->add_initializers("fill_constant&0.0");
common_config->add_params("Moment2");
common_config->add_dims(emb_dim);
common_config->add_initializers("fill_constant&0.0");
common_config->add_params("Beta1Pow");
common_config->add_dims(1);
common_config->add_initializers("fill_constant&1.0");
common_config->add_params("Beta2Pow");
common_config->add_dims(1);
common_config->add_initializers("fill_constant&1.0");

auto ret = table->initialize(table_config, fs_config);
ASSERT_EQ(ret, 0);
}
} // namespace distributed
} // namespace paddle
......@@ -218,16 +218,16 @@ if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
heterxpu_trainer.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto trainer_desc_proto glog fs shell
fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
graph_to_program_pass variable_helper data_feed_proto timer monitor
heter_service_proto)
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor heter_service_proto fleet)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endif()
elseif(WITH_PSLIB)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
......@@ -239,11 +239,7 @@ elseif(WITH_PSLIB)
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor pslib_brpc )
# TODO: Fix these unittest failed on Windows
# This unittest will always failed, now no CI will run this unittest
if(NOT WITH_MUSL AND NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
else()
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
......@@ -254,11 +250,6 @@ else()
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor)
# TODO: Fix these unittest failed on Windows
# This unittest will always failed, now no CI will run this unittest
if(NOT WITH_MUSL AND NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
endif()
target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
......
......@@ -15,10 +15,10 @@ cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_he
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
if(WITH_DISTRIBUTE)
if(NOT WITH_GRPC)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endif()
set_source_files_properties(threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endif()
......@@ -36,7 +36,7 @@ if(WITH_GPU)
if(WITH_DISTRIBUTE)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
ddim dynload_cuda selected_rows_functor)
else()
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor)
......@@ -52,7 +52,7 @@ else()
variable_visitor place device_memory_aligment)
if(WITH_DISTRIBUTE)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor sendrecvop_rpc)
ddim selected_rows_functor)
else()
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor)
......@@ -85,9 +85,7 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS
cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)
set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor)
if(WITH_DISTRIBUTE)
list(APPEND ASYNC_SSA_GRAPH_EXECUTOR_DEPS communicator)
endif()
cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS})
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/variable_helper.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/distributed/service/communicator.h"
#endif
namespace paddle {
......@@ -43,40 +43,7 @@ inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
}
// get CommContext and remote send and recv op
void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
#ifdef PADDLE_WITH_DISTRIBUTE
bool need_communicator = false;
for (auto &node : graphs[0]->Nodes()) {
VLOG(3) << "node name " << node->Name();
if (node && node->IsOp()) {
if (node->Name() == "send") {
auto send_varnames =
BOOST_GET_CONST(std::vector<std::string>,
node->Op()->GetNullableAttr("send_varnames"));
if (send_varnames.size() > 0) {
need_communicator = true;
break;
}
}
}
}
if (need_communicator) {
// init communicator here
auto *instance = operators::distributed::Communicator::GetInstance();
auto initialized = instance ? true : false;
PADDLE_ENFORCE_EQ(initialized, true,
platform::errors::InvalidArgument(
"Communicator is not Initialized, you may use "
"FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
"develop/markdown_doc/transpiler)"));
}
#endif
}
void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) { return; }
AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
......@@ -171,12 +138,12 @@ FetchResultType AsyncSSAGraphExecutor::Run(
"results to be fetched!"));
// init once
if (run_futures_.size() == 0 && places_.size() > 1) {
if (strategy_.thread_barrier_) {
#ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::Communicator::GetInstance()->BarrierTriggerReset(
if (strategy_.thread_barrier_) {
paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset(
places_.size());
#endif
}
#endif
exception_holder_.Clear();
StartOffPythonTrainLoop(return_merged);
}
......
......@@ -19,11 +19,6 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/collective_client.h"
#include "paddle/fluid/operators/distributed/collective_server.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#endif
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -51,106 +46,6 @@ void ReduceOpHandle::Wait(
}
}
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
template <typename DevCtx, typename DataType>
void ReduceOpHandle::GatherSelectedRows(
const std::vector<const SelectedRows *> &src_selected_rows,
const std::vector<platform::Place> &in_places,
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
VarHandle *out_var_handle, const platform::Place &out_place,
SelectedRows *dst_selected_rows) {
const CollectiveContext &collective_context =
*CollectiveContext::GetInstance();
// 1. gather local selected rows, merge them
std::string gathered_var_name = out_var_handle->name() + "_gathered_tmp";
auto scope = local_scopes_.at(out_var_handle->scope_idx());
auto gathered_var_mid = scope->Var(gathered_var_name);
auto gathered_select_rows =
gathered_var_mid->GetMutable<framework::SelectedRows>();
GatherLocalSelectedRowsFunctor functor(
src_selected_rows, in_places, dev_ctxes, out_place, gathered_select_rows);
WaitInputVarGenerated();
functor();
// FIXME(gongwb): remove this Wait.
Wait(dev_ctxes);
// merge them
auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
std::string merged_var_name =
GetRemoteVarName(out_var_handle->name(), collective_context.trainer_id_);
auto merged_select_rows =
scope->Var(merged_var_name)->GetMutable<SelectedRows>();
operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
// 2. start collective server if it doesn't exist
operators::distributed::CollectiveServer *server =
operators::distributed::CollectiveServer::GetInstance(
collective_context.endpoints_[collective_context.trainer_id_],
collective_context.endpoints_.size() - 1);
auto rpc_server = server->GetRPCServer();
rpc_server->RegisterVar(merged_var_name,
operators::distributed::kRequestGetMonomerVariable,
scope, merged_dev_ctx);
// 3. gather them from all remote nodes.
std::vector<const SelectedRows *> remote;
operators::distributed::CollectiveClient *client =
operators::distributed::CollectiveClient::GetInstance();
std::vector<operators::distributed::RemoteVar> vars;
for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
if (i == (unsigned)collective_context.trainer_id_) continue;
operators::distributed::RemoteVar var;
var.trainer_id_ = i;
var.var_name_ = GetRemoteVarName(out_var_handle->name(), i);
var.ep_ = collective_context.endpoints_[i];
vars.push_back(var);
VLOG(4) << "gather from:" << var.String();
}
// erase gathered vars
merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE_EQ(
client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows.
std::vector<const SelectedRows *> all;
all.resize(collective_context.endpoints_.size());
for (auto v : vars) {
all[v.trainer_id_] =
scope->FindVar(v.var_name_)->GetMutable<SelectedRows>();
}
all[collective_context.trainer_id_] = merged_select_rows;
merge_func(*merged_dev_ctx, all, dst_selected_rows);
rpc_server->WaitVarBarrier(merged_var_name);
rpc_server->ClearVar(merged_var_name);
// 5. clear mid vars
std::vector<std::string> tmp_vars{merged_var_name};
for (auto r : vars) {
tmp_vars.push_back(r.var_name_);
}
scope->EraseVars(tmp_vars);
}
#endif
void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
......@@ -241,25 +136,6 @@ void ReduceOpHandle::RunImpl() {
functor();
return;
}
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
if (in_selected_rows[0]->value().type() ==
framework::proto::VarType::FP32) {
GatherSelectedRows<platform::CUDADeviceContext, float>(
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else if (in_selected_rows[0]->value().type() ==
framework::proto::VarType::FP64) {
GatherSelectedRows<platform::CUDADeviceContext, double>(
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
}
#endif
});
} else {
std::vector<const LoDTensor *> lod_tensors =
......
......@@ -18,7 +18,7 @@
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/distributed/service/communicator.h"
#endif
namespace paddle {
......@@ -362,14 +362,11 @@ void ThreadedSSAGraphExecutor::ExecutionFinal(
std::vector<OpHandleBase *> *fetch_ops) {
#ifdef PADDLE_WITH_DISTRIBUTE
if (strategy_.thread_barrier_) {
operators::distributed::Communicator::GetInstance()
->BarrierTriggerDecrement();
paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
}
#endif
VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it";
ClearFetchOp(graph_, fetch_ops);
exception_holder_.ReThrow();
}
......
......@@ -34,7 +34,6 @@ limitations under the License. */
#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_MKLDNN
......@@ -91,13 +90,13 @@ Executor::~Executor() {
}
void Executor::Close() {
#ifdef PADDLE_WITH_DISTRIBUTE
// TODO(typhoonzero): complete message will need to use real trainer_id,
// except 0.
auto client =
paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
client->SendComplete();
#endif
// #ifdef PADDLE_WITH_DISTRIBUTE
// // TODO(typhoonzero): complete message will need to use real trainer_id,
// // except 0.
// auto client =
// paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
// client->SendComplete();
// #endif
}
void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
......
......@@ -16,10 +16,13 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/lodtensor_printer.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/distributed/service/communicator.h"
#endif
namespace paddle {
namespace framework {
......@@ -185,8 +188,7 @@ void HogwildWorker::TrainFilesWithProfiler() {
#ifdef PADDLE_WITH_DISTRIBUTE
if (thread_barrier_) {
operators::distributed::Communicator::GetInstance()
->BarrierTriggerDecrement();
paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
}
#endif
}
......@@ -216,8 +218,7 @@ void HogwildWorker::TrainFiles() {
}
#ifdef PADDLE_WITH_DISTRIBUTE
if (thread_barrier_) {
operators::distributed::Communicator::GetInstance()
->BarrierTriggerDecrement();
paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
}
#endif
}
......
......@@ -17,7 +17,10 @@ limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/distributed/service/communicator.h"
#endif
namespace paddle {
namespace framework {
......@@ -48,7 +51,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
#ifdef PADDLE_WITH_DISTRIBUTE
if (trainer_desc.thread_barrier()) {
operators::distributed::Communicator::GetInstance()->BarrierTriggerReset(
paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset(
thread_num_);
}
#endif
......
......@@ -77,8 +77,13 @@ set(SHARED_INFERENCE_SRCS
${mkldnn_quantizer_src_file})
# Create shared inference library defaultly
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
if(NOT WITH_DISTRIBUTE)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} analysis_predictor)
else()
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} analysis_predictor fleet ps_service)
endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
......
#!/bin/sh
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sanity-check the exported dynamic symbols of a shared library:
#   1. it must export at least one paddle symbol;
#   2. it must not leak google (glog/gflags/protobuf) "T"-type symbols,
#      ignoring paddle- and brpc-owned ones.
#
# Usage: check_symbols.sh <path-to-shared-library>
# Exits non-zero on any violation.

# Validate the argument count before touching $1.
if [ $# -ne 1 ]; then echo "No input library"; exit 1; fi
lib="$1"

# `nm -D` lists only the dynamic symbol table; `grep -c` counts matches.
num_paddle_syms=$(nm -D "${lib}" | grep -c paddle)
num_google_syms=$(nm -D "${lib}" | grep google | grep -v paddle | grep -v brpc | grep -c "T ")

# NOTE: `exit -1` is not portable sh; use a plain non-zero status instead.
if [ "${num_paddle_syms}" -le 0 ]; then echo "Have no paddle symbols"; exit 1; fi
if [ "${num_google_syms}" -ge 1 ]; then echo "Have some google symbols"; exit 1; fi
......
......@@ -20,9 +20,9 @@ add_subdirectory(reduce_ops)
add_subdirectory(sequence_ops)
add_subdirectory(jit)
if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
add_subdirectory(distributed_ops)
add_subdirectory(pscore)
add_subdirectory(collective)
endif()
......@@ -50,10 +50,6 @@ if (WITH_GPU)
endif()
endif()
SET(OP_PREFETCH_DEPS "")
if (WITH_DISTRIBUTE)
SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif()
SET(OP_MKL_DEPS "")
if (NOT WITH_MKL OR NOT WITH_AVX)
......@@ -70,9 +66,9 @@ if(WITH_UNITY_BUILD)
endif()
register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
if (WITH_GPU)
# warpctc_op needs cudnn 7 above
......@@ -86,9 +82,10 @@ if (WITH_GPU)
else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
op_library(lstm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS} lstm_compute)
op_library(eye_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
op_library(recurrent_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
op_library(eye_op DEPS ${OP_HEADER_DEPS})
op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
......@@ -163,5 +160,5 @@ if(WITH_UNITY_BUILD)
# Using Unity Build to compile operators, `register_operator` will cause
# the unity library to lose some symbols.
# The specified link dependency needs to be displayed here.
target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS} ${COMMON_OP_DEPS})
target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS})
endif()
include(operators)

# Link dependencies for the collective ops. The RPC transport differs
# between the gRPC and bRPC builds, so the dependency list is selected here.
set(COLLECTIVE_DEPS "")
if(WITH_GRPC)
  # gRPC transport: link the unsecure gRPC runtime libraries directly.
  set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr zlib protobuf node)
else()
  # bRPC transport: additionally needs leveldb/ssl/crypto.
  set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb protobuf ssl crypto zlib node)
  if(WITH_BRPC_RDMA)
    # RDMA-enabled brpc also links the system ibverbs/rdmacm libraries,
    # wrapped as GLOBAL imported targets so later targets can reference them.
    find_library(IBVERBS_LIBRARY NAMES ibverbs)
    ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
    SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})

    find_library(RDMACM_LIBRARY NAMES rdmacm)
    ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
    SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} ibverbs rdmacm)
  endif()
endif()

# Suppress non-virtual-dtor warnings triggered by generated RPC headers.
set(COLLECTIVE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
namespace paddle {
namespace operators {
// Operator definition for allreduce. Shape inference is intentionally a
// no-op: the output buffer is resized to match the input inside the kernel.
class AllReduceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {}

 protected:
  // Kernel selection follows the dtype of input "X" and the current place.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto input_data_type =
        OperatorWithKernel::IndicateVarDataType(ctx, "X");
    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
};
// Declares allreduce's inputs/outputs/attributes for the op registry.
// User-facing documentation is embedded in the DOC string below.
class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Tensor), tensor to be allreduced.");
    AddOutput("Out", "(Tensor) the result of allreduced.");
    // Integer code mapped to an ncclRedOp_t in the kernel (0=sum, 1=prod,
    // 2=max, 3=min); defaults to sum.
    AddAttr<int>("reduce_type", "(int) determin the reduce type.")
        .SetDefault(0);
    AddAttr<bool>(
        "sync_mode",
        "(bool) whether to synchronize the CUDA stream after nccl call.")
        .SetDefault(false);
    AddComment(R"DOC(
***AllReduce Operator***
Call NCCL AllReduce internally. Note that this op must be used when one
thread is managing one GPU device.
For speed reasons, reduce_type should be an integer:
0: sum
1: prod
2: max
3: min
If input and output are the same variable, in-place allreduce will be used.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;

// Register allreduce without a gradient op (it is a collective primitive),
// plus CPU kernels for the common dtypes. Note the kernel itself still
// enforces a GPU place at run time (see AllReduceOpKernel::Compute).
REGISTER_OP_WITHOUT_GRADIENT(allreduce, ops::AllReduceOp,
                             ops::AllReduceOpMaker);

REGISTER_OP_CPU_KERNEL(
    allreduce, ops::AllReduceOpKernel<plat::CPUDeviceContext, float>,
    ops::AllReduceOpKernel<plat::CPUDeviceContext, double>,
    ops::AllReduceOpKernel<plat::CPUDeviceContext, int>,
    ops::AllReduceOpKernel<plat::CPUDeviceContext, int64_t>,
    ops::AllReduceOpKernel<plat::CPUDeviceContext, plat::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
allreduce, ops::AllReduceOpKernel<plat::CUDADeviceContext, float>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, double>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
// Kernel for the allreduce op. Only the NCCL build (PADDLE_WITH_NCCL) has a
// real implementation; without NCCL, Compute() unconditionally throws.
template <typename DeviceContext, typename T>
class AllReduceOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto place = ctx.GetPlace();
    // Even the registered CPU kernel refuses to run anywhere but a GPU
    // place: the reduction is delegated entirely to NCCL.
    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
                      platform::errors::PreconditionNotMet(
                          "AllReduce op can run on gpu place only for now."));
#if defined(PADDLE_WITH_NCCL)
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto in = ctx.Input<framework::Tensor>("X");
    auto out = ctx.Output<framework::Tensor>("Out");

    int dtype = platform::ToNCCLDataType(in->type());
    int64_t numel = in->numel();
    auto* sendbuff = in->data<void>();
    // Output is resized to the input's dims; when X and Out share storage
    // this degenerates into an in-place allreduce.
    out->Resize(in->dims());
    void* recvbuff = out->mutable_data<T>(place);

    auto* comm = dev_ctx.nccl_comm();
    // FIXME(typhoonzero): should use nccl stream here.
    auto stream = dev_ctx.stream();
    PADDLE_ENFORCE_NOT_NULL(
        stream, platform::errors::NotFound("Should initialize NCCL firstly."));

    // Map the integer attribute to NCCL's reduction op; any unrecognized
    // value silently keeps the ncclSum initializer.
    int reduce_type = ctx.Attr<int>("reduce_type");
    ncclRedOp_t red_type = ncclSum;
    switch (reduce_type) {
      case 0:
        red_type = ncclSum;
        break;
      case 1:
        red_type = ncclProd;
        break;
      case 2:
        red_type = ncclMax;
        break;
      case 3:
        red_type = ncclMin;
        break;
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
        sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
        comm, stream));
    // Optionally block until the collective completes on this stream.
    if (ctx.Attr<bool>("sync_mode")) {
      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
    }
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with GPU."));
#endif
  }
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <ostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
// Operator definition for broadcast: InferShape only validates that both
// the input and the (in-place) output variables are present; the actual
// shape is resolved at run time by the NCCL kernel.
class BroadcastOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
                      platform::errors::InvalidArgument(
                          "Input(X) of BroadcastOp should not be null."));
    // Bug fix: the message previously read "Output(Output) of ConvOp",
    // a copy-paste error that misreported both the op and the output name.
    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
                      platform::errors::InvalidArgument(
                          "Output(Out) of BroadcastOp should not be null."));
  }
};
// Declares broadcast's inputs/outputs/attributes for the op registry.
// User-facing documentation is embedded in the DOC string below.
class BroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Tensor), tensor to be broadcast.");
    AddOutput("Out", "(Tensor) the result of broadcast.");
    AddAttr<bool>(
        "sync_mode",
        "(bool) whether to synchronize the CUDA stream after nccl call.")
        .SetDefault(false);
    // Rank of the broadcasting root; constrained to be non-negative.
    AddAttr<int>("root", "(int).").SetDefault(0).EqualGreaterThan(0);
    AddComment(R"DOC(
***Broadcast Operator***
Call NCCL Broadcast internally. Note that this op must be used when one
thread is managing one GPU device.
)DOC");
  }
};
// CPU placeholder kernel: broadcast has no CPU implementation, so any
// dispatch to this kernel fails immediately with a precondition error.
// The real implementation is the CUDA NCCLBroadcastOpKernel.
template <typename T>
class BroadcastOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "Broadcast op can run on gpu place only for now."));
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;

// Register broadcast without a gradient op (collective primitive) plus CPU
// kernels for the common dtypes. The CPU kernels are stubs that always
// throw (see BroadcastOpKernel above); GPU kernels are registered in the
// .cu.cc translation unit.
REGISTER_OP_WITHOUT_GRADIENT(broadcast, ops::BroadcastOp,
                             ops::BroadcastOpMaker);

REGISTER_OP_CPU_KERNEL(broadcast, ops::BroadcastOpKernel<float>,
                       ops::BroadcastOpKernel<double>,
                       ops::BroadcastOpKernel<int>,
                       ops::BroadcastOpKernel<int64_t>,
                       ops::BroadcastOpKernel<plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace ops = paddle::operators;
namespace plat = paddle::platform;
namespace paddle {
namespace operators {
// CUDA kernel for broadcast, implemented with ncclBcast. The op is strictly
// in-place: the output must already be initialized and must alias the input
// buffer, since NCCL broadcasts through a single send/recv buffer.
template <typename T>
class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::PreconditionNotMet(
            "The place of ExecutionContext should be CUDAPlace."));

#if defined(PADDLE_WITH_NCCL)
    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device;
    // "root" attribute names the rank whose data is broadcast to the others.
    int root_dev_id = ctx.Attr<int>("root");

    auto in = ctx.Input<framework::Tensor>("X");
    auto out = ctx.Output<framework::Tensor>("Out");
    // The output must be pre-initialized: ncclBcast both reads (on root)
    // and writes (on non-root) through the same buffer.
    PADDLE_ENFORCE_EQ(
        out->IsInitialized(), true,
        platform::errors::PreconditionNotMet(
            "Currently, the output of broadcast op must be initialized,"
            "because this op can only be an In-Place operation."));
    void* send_recv_buffer = out->mutable_data<T>(ctx.GetPlace());
    // Enforce the in-place contract: Out must alias X's storage.
    PADDLE_ENFORCE_EQ(
        send_recv_buffer, in->data<void>(),
        platform::errors::PreconditionNotMet("Currently, the broadcast op can "
                                             "only be an In-Place operation."));

    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto comm = dev_ctx.nccl_comm();
    auto stream = dev_ctx.stream();

    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
        send_recv_buffer, static_cast<size_t>(in->numel()),
        platform::ToNCCLDataType(in->type()), root_dev_id, comm, stream));

    VLOG(3) << "Bcast " << ctx.InputNames("X")[0] << ", (" << in->numel() << ")"
            << " From " << root_dev_id << " to " << dev_id;

    // Optionally block until the broadcast completes on this stream.
    if (ctx.Attr<bool>("sync_mode")) {
      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
    }
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with GPU."));
#endif
  }
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(broadcast, ops::NCCLBroadcastOpKernel<float>,
ops::NCCLBroadcastOpKernel<double>,
ops::NCCLBroadcastOpKernel<int>,
ops::NCCLBroadcastOpKernel<int64_t>,
ops::NCCLBroadcastOpKernel<plat::float16>);
......@@ -23,8 +23,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
......
if(NOT WITH_DISTRIBUTE)
return()
endif()
return()
if(WITH_GRPC)
set(cc_generic_services "false")
......
......@@ -28,10 +28,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -24,10 +24,6 @@ limitations under the License. */
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -23,10 +23,6 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -24,10 +24,6 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -26,10 +26,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......@@ -187,72 +183,7 @@ class NCEKernel : public framework::OpKernel<T> {
// forward mul
auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
// for remote prefetch
auto remote_prefetch = context.Attr<bool>("remote_prefetch");
auto epmap = context.Attr<std::vector<std::string>>("epmap");
if (remote_prefetch && !epmap.empty()) {
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
// server
std::vector<int64_t> labels;
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
labels.push_back(sample_labels_data[i]);
}
std::set<T> st(labels.begin(), labels.end());
labels.assign(st.begin(), st.end());
framework::Scope &local_scope = context.scope().NewScope();
auto table_names = context.Attr<std::vector<std::string>>("table_names");
auto *ids = local_scope.Var("Ids@Prefetch");
auto *x_tensor = ids->GetMutable<framework::LoDTensor>();
x_tensor->mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(labels.size()), 1}),
context.GetPlace());
// copy.
std::memcpy(x_tensor->data<int64_t>(), labels.data(),
labels.size() * sizeof(int64_t));
std::vector<int> w_dims = paddle::framework::vectorize<int>(
context.Input<Tensor>("Weight")->dims());
w_dims[0] = static_cast<int>(labels.size());
auto *w_tensor = local_scope.Var("Weight@Prefetch")
->GetMutable<framework::LoDTensor>();
w_tensor->Resize(framework::make_ddim(w_dims));
#ifdef PADDLE_WITH_DISTRIBUTE
auto weight = context.InputNames("Weight").front();
operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch",
weight, false, table_names, epmap,
context, local_scope);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!"));
#endif
auto weight_mat = EigenMatrix<T>::From(
(local_scope.Var("Weight@Prefetch")->Get<framework::LoDTensor>()));
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
std::vector<int64_t>::iterator it =
std::find(labels.begin(), labels.end(), sample_labels_data[i]);
int idx = std::distance(labels.begin(), it);
Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
(input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
weight_mat.chip(idx, 0))
.sum();
sample_out_data[i] += result(0);
sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
}
context.scope().DeleteScope(&local_scope);
} else {
auto weight_mat =
EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
(input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
......@@ -261,7 +192,6 @@ class NCEKernel : public framework::OpKernel<T> {
sample_out_data[i] += result(0);
sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
}
}
// forward cost
for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {
......
# Build rules for the PS-core (parameter-server) operators.
include(operators)

set(DISTRIBUTE_DEPS "")
# Every operator in this directory links against the fleet/PS service stack
# (brpc RPC transport plus its transitive deps: leveldb, ssl, protobuf, ...).
list(APPEND DISTRIBUTE_DEPS fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy)

# brpc/protobuf headers emit non-virtual-dtor warnings; do not fail the build on them.
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")

if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
    # GCC > 7 needs -faligned-new for over-aligned allocations in deps.
    set(DISTRIBUTE_COMPILE_FLAGS
            "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
endif()

# Apply the relaxed warning flags to every *_op.cc in this directory.
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
list(REMOVE_DUPLICATES OPS)

foreach (src ${OPS})
    set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach ()

register_operators()

# Export the dependency list so the parent operators/CMakeLists links them in.
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE)

set_source_files_properties(heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op)

set_source_files_properties(heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS})
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h"
namespace paddle {
namespace operators {
constexpr int64_t kNoPadding = -1;
// Op definition for distributed embedding lookup: validates Ids/W shapes and
// infers the Outputs shapes, distinguishing the v1/v2 embedding layouts.
class DistributedLookupTableOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true,
                      platform::errors::InvalidArgument(
                          "Input(Ids) of LookupTableOp should not be null."));
    PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
                      platform::errors::InvalidArgument(
                          "Input(W) of LookupTableOp should not be null."));
    PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true,
                      platform::errors::InvalidArgument(
                          "Output(Outs) of LookupTableOp should not be null."));

    auto ids_dims = ctx->GetInputsDim("Ids");
    auto table_dims = ctx->GetInputDim("W");

    // The embedding table must be a 2-D [vocab, emb_dim] tensor.
    PADDLE_ENFORCE_EQ(
        table_dims.size(), 2,
        platform::errors::InvalidArgument(
            "Only 2 dimensions of the 'Embedding' is supported."));

    for (auto &ids_dim : ids_dims) {
      PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
                        platform::errors::InvalidArgument(
                            "The dimension of the 'Ids' tensor must be 2."));
    }

    // for fluid.embedding
    // "lookup_table"    (v1): output is [ids_rows, emb_dim].
    // "lookup_table_v2" (v2): output keeps both Ids dims -> [d0, d1, emb_dim].
    auto lookup_table_version =
        ctx->Attrs().Get<std::string>("lookup_table_version");

    auto outputs_dims = std::vector<framework::DDim>();

    for (auto &ids_dim : ids_dims) {
      if (lookup_table_version == "lookup_table") {
        outputs_dims.push_back(
            framework::make_ddim({ids_dim[0], table_dims[1]}));
      } else if (lookup_table_version == "lookup_table_v2") {
        outputs_dims.push_back(framework::make_ddim(
            {static_cast<int64_t>(ids_dim[0]), static_cast<int64_t>(ids_dim[1]),
             static_cast<int64_t>(table_dims[1])}));
      }
    }

    ctx->SetOutputsDim("Outputs", outputs_dims);
    ctx->ShareLoD("Ids", /*->*/ "Outputs");
  }

 protected:
  // Kernel dtype comes from the "dtype" attribute, not from the inputs.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
        ctx.GetPlace());
  }
};
// Proto/attribute declaration for the distributed_lookup_table op.
class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Ids",
             "(LoDTensor) Ids's type should be LoDTensor"
             "THe ids to be looked up in W.")
        .AsDuplicable();
    AddInput("W",
             "(Tensor) The input represents embedding tensors, "
             "which is a learnable parameter.");
    AddOutput("Outputs",
              "(LoDTensor) The lookup results, which have the same type as W.")
        .AsDuplicable();
    // Which sparse table on the parameter server holds this embedding.
    AddAttr<int>("table_id", "sparse table id").SetDefault(0);
    AddAttr<bool>("is_distributed",
                  "(boolean, default false) distributed lookup table.")
        .SetDefault(false);
    // Distinguishes the fluid v1/v2 embedding output layouts (see InferShape).
    AddAttr<std::string>(
        "lookup_table_version",
        "(string, default lookup_table) "
        "To distinguish between different versions of embedding OP")
        .SetDefault(std::string("lookup_table"));
    AddAttr<int64_t>("padding_idx",
                     "(int64, default -1) "
                     "If the value is -1, it makes no effect to lookup. "
                     "Otherwise the given value indicates padding the output "
                     "with zeros whenever lookup encounters it in Ids.")
        .SetDefault(kNoPadding);
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
        .SetDefault(framework::proto::VarType::FP32);
    AddComment(R"DOC(
Lookup Tablel Prefetch Operator.
This operator is used to perform lookup on parameter W,
then concatenated into a sparse tensor.
The type of Ids(Input) is SelectedRows, the rows of Ids contains
the ids to be looked up in W;
if the Id is not in the sparse table, this operator will return a
random value and set the value into the table for the next looking up.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the op (no gradient op — the backward path is handled by the PS
// runtime) and its float CPU kernel.
REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
                  ops::DistributedLookupTableOpMaker);

REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
                       ops::DistributedLookupTableKernel<
                           paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;

// GPU registration of the same kernel; the kernel itself copies tensors to
// CPU for the PullSparse round-trip (see the header).
REGISTER_OP_CUDA_KERNEL(
    distributed_lookup_table,
    ops::DistributedLookupTableKernel<plat::CUDADeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/fleet.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
// Kernel that pulls embedding rows from the parameter server (via
// FleetWrapper::PullSparseToTensorSync) into the Outputs tensors.
// CPU path pulls directly; GPU path stages inputs/outputs through a temporary
// CPU scope and copies back to the device.
template <typename DeviceContext, typename T>
class DistributedLookupTableKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto &scope = context.scope();
    auto padding_idx = context.Attr<int64_t>("padding_idx");
    auto table_id = context.Attr<int>("table_id");

    auto embedding_name = context.InputNames("W").front();
    int64_t emb_dim = 0;

    // The embedding width is taken from W's second dim; W may live either as
    // a dense LoDTensor or as SelectedRows.
    auto *var = scope.FindVar(embedding_name);

    if (var->IsType<framework::LoDTensor>()) {
      emb_dim = var->Get<framework::LoDTensor>().dims()[1];
    } else if (var->IsType<framework::SelectedRows>()) {
      emb_dim = var->Get<framework::SelectedRows>().value().dims()[1];
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Expected type of `W` must be Tensor, SelectedRows.But got "
          "unsupport type: %s.",
          framework::ToTypeName(var->Type())));
    }

    auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
    auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");

    auto fleet = distributed::FleetWrapper::GetInstance();

    if (platform::is_cpu_place(context.GetPlace())) {
      // CPU: pull straight into the op's output tensors.
      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
                                    static_cast<uint64_t>(padding_idx),
                                    context.GetPlace(), &inputs, &outputs);
    } else {
      // Non-CPU (GPU): the PS client only works with host memory, so stage
      // everything through a temporary CPU scope.
      auto inputs_variable = context.MultiInputVar("Ids");
      auto outputs_variable = context.MultiOutputVar("Outputs");
      auto inputs_name = context.InputNames("Ids");
      auto outputs_name = context.OutputNames("Outputs");

      auto cpu_place = platform::CPUPlace();
      framework::Scope *tmp_scope = scope.NewTmpScope().release();

      std::vector<const framework::LoDTensor *> tmp_input_vec;
      auto input_var_size = inputs_variable.size();
      std::vector<framework::LoDTensor *> tmp_output_vec;
      auto output_var_size = outputs_variable.size();

      // create temp input: device -> CPU copies of every Ids tensor.
      for (size_t idx = 0; idx < input_var_size; ++idx) {
        framework::Variable *tmp_input_var = tmp_scope->Var(inputs_name[idx]);
        framework::LoDTensor *tmp_input_tensor =
            tmp_input_var->GetMutable<framework::LoDTensor>();
        framework::TensorCopy(inputs_variable[idx]->Get<framework::LoDTensor>(),
                              cpu_place, context.device_context(),
                              tmp_input_tensor);
        tmp_input_vec.push_back(tmp_input_tensor);
      }

      // create temp output: CPU tensors resized to the expected output dims.
      for (size_t idx = 0; idx < output_var_size; ++idx) {
        framework::Variable *tmp_output_var = tmp_scope->Var(outputs_name[idx]);
        framework::LoDTensor *tmp_output_tensor =
            tmp_output_var->GetMutable<framework::LoDTensor>();
        tmp_output_tensor->Resize(outputs[idx]->dims());
        tmp_output_vec.push_back(tmp_output_tensor);
      }

      // use fleet->PullSparse on the CPU staging tensors.
      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
                                    static_cast<uint64_t>(padding_idx),
                                    cpu_place, &tmp_input_vec, &tmp_output_vec);

      // cp temp to origin: copy the pulled rows back to the device outputs.
      for (size_t idx = 0; idx < output_var_size; ++idx) {
        framework::Variable *tmp_output_var = tmp_scope->Var(outputs_name[idx]);
        framework::LoDTensor *tmp_output_tensor =
            tmp_output_var->GetMutable<framework::LoDTensor>();
        framework::TensorCopy(
            *tmp_output_tensor, context.GetPlace(), context.device_context(),
            outputs_variable[idx]->GetMutable<framework::LoDTensor>());
      }
      delete tmp_scope;
    }

    auto id_names = context.InputNames("Ids");
    auto out_names = context.OutputNames("Outputs");
    auto lookup_table_version =
        context.Attr<std::string>("lookup_table_version");

    // For the v2 embedding layout, reshape each output from the pulled
    // [d0*d1, emb_dim] layout to [d0, d1, emb_dim] (metadata-only resize).
    if (lookup_table_version == "lookup_table_v2") {
      for (size_t i = 0; i < id_names.size(); ++i) {
        auto *id_var = scope.FindVar(id_names[i]);
        auto *out_var = scope.FindVar(out_names[i]);
        auto *id_tensor = id_var->GetMutable<framework::LoDTensor>();
        auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();

        auto id_dims = id_tensor->dims();
        out_tensor->Resize(framework::make_ddim(
            {static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
             static_cast<int64_t>(emb_dim)}));
      }
    }
  }
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
// Shape inference for fake_init: Out takes its shape directly from the
// "shape" attribute; no inputs are consulted.
class FakeInitInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FakeInit");
    ctx->SetOutputDim(
        "Out",
        framework::make_ddim(ctx->Attrs().Get<std::vector<int64_t>>("shape")));
  }
};
// fake_init resizes its output variable to the requested shape WITHOUT
// allocating or initializing memory. It is used on the trainer side to stand
// in for parameters that actually live in the distributed lookup table.
class FakeInitOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &dev_place) const override {
    framework::Tensor *tensor = nullptr;

    auto &out_var = *scope.FindVar(Output("Out"));

    if (out_var.IsType<framework::LoDTensor>()) {
      tensor = out_var.GetMutable<framework::LoDTensor>();
      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
    } else if (out_var.IsType<framework::SelectedRows>()) {
      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
    } else {
      // BUG FIX: the two adjacent string literals previously concatenated to
      // "onlysupports" — a space was missing at the join.
      PADDLE_THROW(platform::errors::InvalidArgument(
          "fake init op's output only "
          "supports SelectedRows and LoDTensor"));
    }
  }
};
// Intentionally empty: fake_init never changes its output's variable type,
// so no var-type inference is needed.
class FakeInitOpVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(framework::InferVarTypeContext *ctx) const override {}
};
// Proto/attribute declaration for the fake_init op.
class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddAttr<std::vector<int64_t>>("shape",
                                  "(vector<int64_t>) The shape of the output");
    AddOutput("Out",
              "(Tensor) Tensor of specified shape will be filled "
              "with the specified value");
    AddComment(R"DOC(
FakeInit Operator.
Init an variable but not alloc memory for it, it is used for init the
table parameter at trainer side in distributed lookup table.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// fake_init has no gradient — register with empty grad-op makers for both
// static graph (OpDesc) and imperative (OpBase) modes.
REGISTER_OPERATOR(
    fake_init, ops::FakeInitOp, ops::FakeInitInferShape, ops::FakeInitOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    ops::FakeInitOpVarTypeInference);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/framework/op_registry.h"
// Forward declarations only — keeps this translation unit's header
// dependencies light.
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
}  // namespace framework
namespace imperative {
class OpBase;
}  // namespace imperative
namespace distributed {
class Communicator;
}  // namespace distributed
}  // namespace paddle
namespace paddle {
namespace operators {
// fetch_barrier is kept so existing programs containing the op still run;
// under the new communicator its RunImpl does nothing but log.
class FetchBarrierOp : public framework::OperatorBase {
 public:
  FetchBarrierOp(const std::string& type,
                 const framework::VariableNameMap& inputs,
                 const framework::VariableNameMap& outputs,
                 const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  // Intentionally a no-op (see the log message): the barrier sync this op
  // used to perform is no longer needed here.
  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    VLOG(4) << "FetchBarrier Sync, do not need now";
  }
};
// Proto/attribute declaration for the fetch_barrier op. The X/Out slots are
// dummy variables used purely for control-dependency edges in the graph.
class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Any) Dummy inputs, used for control dependency")
        .AsDispensable()
        .AsDuplicable();
    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
        .AsDuplicable();
    // BUG FIX: the previous comment was copy-pasted from SendBarrier and
    // described the wrong operator in the user-facing op documentation.
    AddComment(R"DOC(
FetchBarrier operator

This operator will send a fetch barrier signal to listen_and_serv op, so that
the Parameter Server knows all variables have been fetched.
)DOC");

    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
    AddAttr<std::vector<std::string>>("endpoints",
                                      "(string vector, default 127.0.0.1:6164)"
                                      "Server endpoints to send variables to.")
        .SetDefault({"127.0.0.1:6164"});
  }
};
// Intentionally empty: fetch_barrier has only dummy control-dependency
// variables, so there are no shapes to infer.
class FetchBarrierOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// fetch_barrier has no gradient — register with empty grad-op makers.
REGISTER_OPERATOR(
    fetch_barrier, ops::FetchBarrierOp,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    ops::FetchBarrierOpMaker, ops::FetchBarrierOpShapeInference);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h> // for removing the port file
#include <csignal>
#include <cstdlib>
#include <fstream>
#include <thread> // NOLINT
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
namespace paddle {
namespace operators {
// Splits `str` on `sep` into `pieces`, clearing `pieces` first.
// Empty pieces between separators are kept; a trailing empty piece (i.e.
// when `str` ends with `sep`) is dropped. An empty `str` yields no pieces.
static void split(const std::string &str, char sep,
                  std::vector<std::string> *pieces) {
  pieces->clear();
  if (str.empty()) {
    return;
  }
  size_t begin = 0;
  for (size_t end = str.find(sep); end != std::string::npos;
       end = str.find(sep, begin)) {
    pieces->push_back(str.substr(begin, end - begin));
    begin = end + 1;
  }
  const std::string tail = str.substr(begin);
  if (!tail.empty()) {
    pieces->push_back(tail);
  }
}
// Lifecycle members of HeterListenAndServOp: plain OperatorBase construction,
// and a Stop() hook (currently a no-op) invoked from the destructor.
HeterListenAndServOp::HeterListenAndServOp(
    const std::string &type, const framework::VariableNameMap &inputs,
    const framework::VariableNameMap &outputs,
    const framework::AttributeMap &attrs)
    : OperatorBase(type, inputs, outputs, attrs) {}

HeterListenAndServOp::~HeterListenAndServOp() { Stop(); }

// Currently nothing to tear down; the server loop exits via rpc_service_.
void HeterListenAndServOp::Stop() {}
// Server-side message loop: parses the "message_to_block_id" attribute
// ("msg_name:block_id" strings), prepares each program block, registers one
// RPC handler per message name, then spins until the RPC service exits.
void HeterListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                        framework::ProgramDesc *program,
                                        framework::Scope *recv_scope) const {
  VLOG(2) << "RunAsyncLoop";
  auto message_to_block_id_str =
      Attr<std::vector<std::string>>("message_to_block_id");
  DoubleFindMap<std::string, int32_t> message_to_block_id;

  // Parses one "message:block_id" entry and inserts it, rejecting malformed
  // entries and duplicate message names.
  auto append_block_maps = [](DoubleFindMap<std::string, int32_t> *out_map,
                              const std::string &grad_and_id) {
    std::vector<std::string> pieces;
    split(grad_and_id, ':', &pieces);
    VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
    PADDLE_ENFORCE_EQ(pieces.size(), 2,
                      platform::errors::PreconditionNotMet(
                          "Invalid format of message_and_id argument. "
                          "Expected \"message:block_id\". Recieved %s",
                          grad_and_id.c_str()));
    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0,
                      platform::errors::AlreadyExists(
                          "The message name %s has already existed in out_map",
                          pieces[0].c_str()));

    int block_id = std::stoi(pieces[1]);
    (*out_map)[pieces[0]] = block_id;
  };

  for (const auto &message_and_id : message_to_block_id_str) {
    append_block_maps(&message_to_block_id, message_and_id);
  }

  size_t num_blocks = program->Size();
  PADDLE_ENFORCE_GE(num_blocks, 1,
                    platform::errors::PreconditionNotMet(
                        "Invalid number of blocks in server program. Expected "
                        "equal or greater than 1. Recieved %zu",
                        num_blocks));
  // Block 0 is the global block; only blocks 1..N-1 are executable handlers.
  std::vector<int> block_list;
  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
    block_list.push_back(blkid);
  }
  auto optimize_prepared = executor->Prepare(*program, block_list);
  // execute global block if needed, block id 1 in the program is global
  // block if it's not bind to a grad var for it's update.
  if (block_list[0] == 1 &&
      message_to_block_id.find_value(static_cast<int32_t>(1)) ==
          message_to_block_id.end()) {
    executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
  }

  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>
      message_to_prepared_ctx;
  for (size_t i = 0; i < block_list.size(); ++i) {
    auto blkid = block_list[i];
    auto it = message_to_block_id.find_value(blkid);
    if (it != message_to_block_id.end()) {
      message_to_prepared_ctx[it->first] = optimize_prepared[i];
    }
  }

  request_send_and_recv_handler_->SetGradToPreparedCtx(
      &message_to_prepared_ctx);

  for (size_t i = 0; i < block_list.size(); ++i) {
    auto blkid = block_list[i];
    auto it = message_to_block_id.find_value(blkid);
    // NOTE(review): unlike the loop above, `it` is NOT checked against end()
    // before it->first is dereferenced — this assumes every executable block
    // is bound to a message name; confirm that invariant holds for callers.
    rpc_service_->RegisterServiceHandler(
        it->first, [&](const MultiVarMsg *request, MultiVarMsg *response,
                       brpc::Controller *cntl) -> int {
          return request_send_and_recv_handler_->Handle(request, response,
                                                        cntl);
        });
  }

  // Block this thread until the RPC service signals exit.
  while (true) {
    if (rpc_service_->IsExit()) {
      rpc_service_->Stop();
      VLOG(0) << "get exit. rpc_processor stop!";
      break;
    }
    sleep(1);
  }  // while(true)
}
// Thread entry point: blocks inside StartHeterService() until the server
// is shut down.
void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service) {
  service->StartHeterService();
}
// Starts the heter RPC server: configures the HeterServer singleton from op
// attributes, wires up the send/recv request handler, launches the server
// thread, then runs the message loop until shutdown.
void HeterListenAndServOp::RunImpl(const framework::Scope &scope,
                                   const platform::Place &dev_place) const {
  // Mark this as PS that it should decide profiling by listening from trainer.
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto &dev_ctx = *pool.Get(dev_place);
  VLOG(1) << "HeterListenAndServOp::RunImpl On gpu? "
          << platform::is_gpu_place(dev_place);
  // All received variables live in a child scope of the op's scope.
  framework::Scope &recv_scope = scope.NewScope();

  auto pserver_id = Attr<int>("pserver_id");
  auto fan_in = Attr<int>("fanin");
  auto inputs = Inputs("X");

  // rpc_service_ must not have been created yet — RunImpl is one-shot.
  PADDLE_ENFORCE_EQ(rpc_service_, nullptr,
                    platform::errors::PreconditionNotMet(
                        "RPC service has been created unexpectedly."));

  std::string endpoint = Attr<std::string>("endpoint");
  VLOG(4) << "pserver_id: " << pserver_id << ", end_point:" << endpoint;

  rpc_service_ = distributed::HeterServer::GetInstance();
  rpc_service_->SetEndPoint(endpoint);
  rpc_service_->SetFanin(fan_in);

  // The server program is taken from the first optimize block's program.
  auto optimize_blocks =
      Attr<std::vector<framework::BlockDesc *>>("optimize_blocks");
  PADDLE_ENFORCE_GE(optimize_blocks.size(), 1,
                    platform::errors::PreconditionNotMet(
                        "optimize blocks is less than 1. Optimize blocks "
                        "should be 1 at least on the pserver side."));

  auto *program = optimize_blocks[0]->Program();
  framework::Executor executor(dev_place);

  request_send_and_recv_handler_.reset(
      new distributed::RequestSendAndRecvHandler());
  request_send_and_recv_handler_->SetScope(&recv_scope);
  request_send_and_recv_handler_->SetDevCtx(&dev_ctx);
  request_send_and_recv_handler_->SetProgram(program);
  request_send_and_recv_handler_->SetExecutor(&executor);

  VLOG(2) << "RunAsyncLoop";
  auto message_to_block_id_str =
      Attr<std::vector<std::string>>("message_to_block_id");

  // start the server listening after all member initialized.
  server_thread_.reset(new std::thread(RunServer, rpc_service_));
  VLOG(3) << "wait server thread to become ready...";
  rpc_service_->WaitServerReady();
  // Blocks here handling requests until the service exits.
  RunAsyncLoop(&executor, program, &recv_scope);
  VLOG(3) << "Wait for Server_thread_ stop";
  (server_thread_.get())->join();
  VLOG(3) << "Server_thread_ stop";
}
// Proto/attribute declaration for the heter_listen_and_serv op.
class HeterListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
    // BUG FIX: the previous AddComment placed string-concatenation syntax
    // (`" + "` fragments) *inside* the raw string literal, so the operator's
    // user-visible documentation literally contained those fragments.
    AddComment(R"DOC(
HeterListenAndServ operator

This operator will start a RPC server which can receive variables from
send_op and send back variables to recv_op.
)DOC");
    AddAttr<std::string>("endpoint",
                         "(string, default 127.0.0.1:6164)"
                         "IP address to listen on.")
        .SetDefault("127.0.0.1:6164")
        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
    AddAttr<int>("pserver_id",
                 "(int, default -1), the parameter server index id")
        .SetDefault(-1);
    AddAttr<std::vector<std::string>>(
        "message_to_block_id",
        "['param1@GRAD.block0:1', 'param2@GRAD.blockn:2'] "
        "a map from message name to it's optimize block id")
        .SetDefault({});
    AddAttr<int>("distributed_mode",
                 "indicate distriubte training mode, 0 is sync, 1 is "
                 "fully-async, 2 is half-async, 3 is geo")
        .SetDefault(0);
    AddAttr<std::vector<framework::BlockDesc *>>(
        "optimize_blocks", "Optimize blocks to run on server side.")
        .SetDefault({});
    AddAttr<int>("fanin", "How many clients send to this server.")
        .SetDefault(1);
    AddAttr<int>("rpc_exec_thread_num", "pserver send thread num.")
        .SetDefault(1);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Kernel-less operator — registered with op class and maker only.
REGISTER_OPERATOR(heter_listen_and_serv, ops::HeterListenAndServOp,
                  ops::HeterListenAndServOpMaker);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <atomic>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/distributed/service/heter_server.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class Executor;
class ProgramDesc;
class Scope;
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
// An unordered_map that can also be searched by VALUE: find_value(v) returns
// an iterator to the first entry whose mapped value equals v (linear scan),
// or end() if none matches.
template <class TKey, class TValue>
class DoubleFindMap : public std::unordered_map<TKey, TValue> {
 public:
  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
    // BUG FIX: the predicate previously hardcoded
    // std::pair<const std::string, int> (taken by value), which only compiled
    // for the <std::string, int32_t> instantiation and copied every pair.
    // Use the map's actual value_type, by const reference.
    return std::find_if(this->begin(), this->end(),
                        [&v](const std::pair<const TKey, TValue> &p) {
                          return p.second == v;
                        });
  }
};
void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service);
// Kernel-less operator that runs the heter parameter-server: RunImpl starts
// an RPC server thread and blocks in RunAsyncLoop serving requests.
class HeterListenAndServOp : public framework::OperatorBase {
 public:
  HeterListenAndServOp(const std::string& type,
                       const framework::VariableNameMap& inputs,
                       const framework::VariableNameMap& outputs,
                       const framework::AttributeMap& attrs);
  virtual ~HeterListenAndServOp();

  // Serves messages mapped by "message_to_block_id" until the RPC service
  // exits; blocks the calling thread.
  void RunAsyncLoop(framework::Executor* executor,
                    framework::ProgramDesc* program,
                    framework::Scope* recv_scope) const;

  void Stop() override;

  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override;

 protected:
  // mutable: these are lazily created inside the const RunImpl.
  mutable std::shared_ptr<paddle::distributed::HeterServer> rpc_service_;
  mutable std::shared_ptr<std::thread> server_thread_;
  mutable std::shared_ptr<paddle::distributed::HeterRequestHandler>
      request_send_and_recv_handler_;
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdlib.h>
#include <unistd.h>
#include <chrono> // NOLINT
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/distributed/service/heter_client.h"
#include "paddle/fluid/distributed/service/heter_server.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace distributed = paddle::distributed;
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
DECLARE_double(eager_delete_tensor_gb);
USE_OP(scale);
USE_NO_KERNEL_OP(heter_listen_and_serv);
// Appends a handler block to `program` containing a single scale op:
// res = 0.5 * x, with res declared as a [1, 10] LoDTensor. Returns the
// new block so the test can register it as an optimize block.
framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
  framework::BlockDesc* block =
      program->AppendBlock(*(program->MutableBlock(0)));

  framework::OpDesc* op = block->AppendOp();
  op->SetType("scale");
  op->SetInput("X", {"x"});
  op->SetOutput("Out", {"res"});
  op->SetAttr("scale", 0.5f);

  auto* out = block->Var("res");
  out->SetType(framework::proto::VarType::LOD_TENSOR);
  out->SetShape({1, 10});

  return block;
}
// Builds the server-side test program: a heter_listen_and_serv op in the
// root block whose message "x" dispatches to the scale block (block 1),
// listening on 127.0.0.1:19944.
void GetHeterListenAndServProgram(framework::ProgramDesc* program) {
  auto root_block = program->MutableBlock(0);

  auto* sub_block = AppendSendAndRecvBlock(program);
  std::vector<framework::BlockDesc*> optimize_blocks;
  optimize_blocks.push_back(sub_block);

  // "x:1" maps message name "x" to block id 1 (the scale block above).
  std::vector<std::string> message_to_block_id = {"x:1"};
  std::string endpoint = "127.0.0.1:19944";

  framework::OpDesc* op = root_block->AppendOp();
  op->SetType("heter_listen_and_serv");
  op->SetInput("X", {});
  op->SetAttr("message_to_block_id", message_to_block_id);
  op->SetAttr("optimize_blocks", optimize_blocks);
  op->SetAttr("endpoint", endpoint);
  op->SetAttr("fanin", 1);
  op->SetAttr("pserver_id", 0);
}
// Declares the "x" and "res" LoDTensor variables on `scope` (no data yet).
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
  auto x_var = scope->Var("x");
  x_var->GetMutable<framework::LoDTensor>();
  auto res_var = scope->Var("res");
  res_var->GetMutable<framework::LoDTensor>();
}
// Client-side setup: creates "x" and "res" as [1, rows_numel] float tensors
// filled with 1.0 ("res" is pre-filled so the post-RPC check sees a change).
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
                         int64_t rows_numel) {
  CreateVarsOnScope(scope, place);

  auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
  float* x_ptr =
      x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
  for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;

  auto res_var = scope->Var("res")->GetMutable<framework::LoDTensor>();
  float* res_ptr =
      res_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
  for (int64_t i = 0; i < rows_numel; ++i) res_ptr[i] = 1.0;
}
// Server-side setup: only declares the variables — data arrives via RPC.
// (rows_numel is currently unused here.)
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
                         int64_t rows_numel) {
  CreateVarsOnScope(scope, place);
}
// Server thread body: builds the heter_listen_and_serv program and runs it.
// Blocks inside RunPreparedContext until the server is stopped by a client.
void StartHeterServer() {
  framework::ProgramDesc program;
  framework::Scope scope;
  platform::CPUPlace place;
  framework::Executor exe(place);
  platform::CPUDeviceContext ctx(place);

  LOG(INFO) << "before GetHeterListenAndServProgram";
  GetHeterListenAndServProgram(&program);
  auto prepared = exe.Prepare(program, 0);

  LOG(INFO) << "before InitTensorsOnServer";
  InitTensorsOnServer(&scope, &place, 10);

  LOG(INFO) << "before RunPreparedContext";
  exe.RunPreparedContext(prepared.get(), &scope, false);
}
// End-to-end check of the heter_listen_and_serv path: the server runs the
// scale sub-block (res = 0.5 * x) and the client verifies each returned
// element equals 0.5.
TEST(HETER_LISTEN_AND_SERV, CPU) {
  // Clear proxies so the loopback brpc connection is not intercepted.
  setenv("http_proxy", "", 1);
  setenv("https_proxy", "", 1);
  std::string endpoint = "127.0.0.1:19944";
  LOG(INFO) << "before StartSendAndRecvServer";
  FLAGS_eager_delete_tensor_gb = -1;
  std::thread server_thread(StartHeterServer);
  // Crude startup wait — no readiness signal from StartHeterServer.
  sleep(1);
  LOG(INFO) << "before HeterClient::GetInstance";
  distributed::HeterClient* rpc_client =
      distributed::HeterClient::GetInstance({endpoint}, 0).get();
  PADDLE_ENFORCE_NE(rpc_client, nullptr,
                    platform::errors::InvalidArgument(
                        "Client Start Fail, Check Your Code & Env"));
  framework::Scope scope;
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
  // create var on local scope
  int64_t rows_numel = 10;
  LOG(INFO) << "before InitTensorsOnClient";
  InitTensorsOnClient(&scope, &place, rows_numel);
  std::string in_var_name("x");
  std::string out_var_name("res");
  std::vector<std::string> send_var = {in_var_name};
  std::vector<std::string> recv_var = {out_var_name};
  LOG(INFO) << "before SendAndRecvAsync";
  rpc_client->SendAndRecvAsync({endpoint}, ctx, scope, in_var_name, send_var,
                               recv_var);
  auto var = scope.Var(out_var_name);
  auto value = var->GetMutable<framework::LoDTensor>();
  auto ptr = value->mutable_data<float>(place);
  LOG(INFO) << "before CHECK";
  // x was all ones; scale=0.5 should leave every element at 0.5.
  for (int64_t i = 0; i < rows_numel; ++i) {
    LOG(INFO) << "ptr " << i << " is " << ptr[i];
    EXPECT_EQ(ptr[i], 0.5);
  }
  LOG(INFO) << "end CHECK";
  // Stop the client; this presumably tears down the server side too,
  // letting the server thread's RunPreparedContext return.
  rpc_client->Stop();
  LOG(INFO) << "end server Stop";
  server_thread.join();
  LOG(INFO) << "end server thread join";
}
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdlib.h>
#include <unistd.h>
#include <chrono> // NOLINT
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/distributed/service/heter_client.h"
#include "paddle/fluid/distributed/service/heter_server.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace distributed = paddle::distributed;
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
USE_OP(scale);
std::shared_ptr<distributed::HeterServer> b_rpc_service;
// Appends a sub-block containing a single `scale` op (res = 0.5 * x) to
// `program` and declares the "res" output (1 x 10 LoDTensor) on the root
// block. Returns the appended sub-block.
framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
  auto* global_block = program->MutableBlock(0);
  auto* scale_block = program->AppendBlock(*global_block);
  framework::OpDesc* scale_op = scale_block->AppendOp();
  scale_op->SetType("scale");
  scale_op->SetInput("X", {"x"});
  scale_op->SetOutput("Out", {"res"});
  scale_op->SetAttr("scale", 0.5f);
  auto* res_var = global_block->Var("res");
  res_var->SetType(framework::proto::VarType::LOD_TENSOR);
  res_var->SetShape({1, 10});
  return scale_block;
}
// Declares all variables used by this test on `scope`. "w" is the only
// SelectedRows variable; the rest are LoDTensors. `place` is unused but
// kept for signature symmetry with the Init* helpers.
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
  scope->Var("w")->GetMutable<framework::SelectedRows>();
  for (const auto* name : {"out", "ids", "x", "res"}) {
    scope->Var(name)->GetMutable<framework::LoDTensor>();
  }
}
// Client-side initialization: "ids" holds the even numbers
// 0, 2, ..., 2*(rows_numel-1); "x" and "res" are 1 x rows_numel float
// tensors filled with ones.
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
                         int64_t rows_numel) {
  CreateVarsOnScope(scope, place);
  auto* ids_tensor = scope->Var("ids")->GetMutable<framework::LoDTensor>();
  int64_t* ids_data = ids_tensor->mutable_data<int64_t>(
      framework::DDim({rows_numel, 1}), *place);
  for (int64_t i = 0; i < rows_numel; ++i) {
    ids_data[i] = i * 2;
  }
  // Allocates a 1 x rows_numel float tensor for `name` and fills it with
  // ones.
  auto fill_ones = [&](const char* name) {
    auto* tensor = scope->Var(name)->GetMutable<framework::LoDTensor>();
    float* data =
        tensor->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
    for (int64_t i = 0; i < rows_numel; ++i) {
      data[i] = 1.0;
    }
  };
  fill_ones("x");
  fill_ones("res");
}
// Server-side initialization: "w" becomes a rows_numel x 10 SelectedRows
// value; with flat index i, element i gets i/10, so row r is filled with
// the constant r.
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
                         int64_t rows_numel) {
  CreateVarsOnScope(scope, place);
  auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
  auto w_value = w->mutable_value();
  w_value->Resize({rows_numel, 10});
  // NOTE(review): AutoGrownIndex(i, true) presumably registers row id i in
  // the SelectedRows index — confirm against the SelectedRows API.
  for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
  auto ptr = w_value->mutable_data<float>(*place);
  for (int64_t i = 0; i < w_value->numel(); ++i) {
    ptr[i] = static_cast<float>(i / 10);
  }
}
// Thread entry point: starts the heter service on the given server
// instance (shared_ptr keeps the server alive for the thread's lifetime).
void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service) {
  service->StartHeterService();
}
// Builds the scale program, registers a SendAndRecv handler for message
// "x" on the global HeterServer singleton, and runs the server on a nested
// thread. Blocks in server_thread.join() until the server is stopped by
// the test body.
void StartSendAndRecvServer(std::string endpoint) {
  framework::ProgramDesc program;
  framework::Scope scope;
  platform::CPUPlace place;
  framework::Executor exe(place);
  platform::CPUDeviceContext ctx(place);
  LOG(INFO) << "before AppendSendAndRecvBlock";
  auto block = AppendSendAndRecvBlock(&program);
  std::string in_var_name("x");
  std::vector<int> prefetch_block_ids{block->ID()};
  auto prepared = exe.Prepare(program, prefetch_block_ids);
  LOG(INFO) << "before InitTensorsOnServer";
  InitTensorsOnServer(&scope, &place, 10);
  LOG(INFO) << "end InitTensorsOnServer";
  // Route message "x" to the prepared execution context of the sub-block.
  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>
      message_to_prepared_ctx;
  message_to_prepared_ctx[in_var_name] = prepared[0];
  std::shared_ptr<distributed::RequestSendAndRecvHandler> b_req_handler;
  b_req_handler.reset(new distributed::RequestSendAndRecvHandler());
  LOG(INFO) << "before SetProgram";
  b_req_handler->SetProgram(&program);
  LOG(INFO) << "before SetGradToPreparedCtx";
  b_req_handler->SetGradToPreparedCtx(&message_to_prepared_ctx);
  LOG(INFO) << "before SetDevCtx";
  b_req_handler->SetDevCtx(&ctx);
  LOG(INFO) << "before SetScope";
  b_req_handler->SetScope(&scope);
  LOG(INFO) << "before SetExecutor";
  b_req_handler->SetExecutor(&exe);
  LOG(INFO) << "before HeterServer::GetInstance";
  b_rpc_service = distributed::HeterServer::GetInstance();
  b_rpc_service->SetEndPoint(endpoint);
  LOG(INFO) << "before HeterServer::RegisterServiceHandler";
  // The lambda captures b_req_handler (and this frame's locals) by
  // reference; they stay valid because join() below keeps this frame alive
  // for the server thread's lifetime.
  b_rpc_service->RegisterServiceHandler(
      in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response,
                       brpc::Controller* cntl) -> int {
        return b_req_handler->Handle(request, response, cntl);
      });
  LOG(INFO) << "before HeterServer::RunServer";
  std::thread server_thread(std::bind(RunServer, b_rpc_service));
  server_thread.join();
}
// End-to-end check of the brpc SendAndRecv path: the server runs the scale
// sub-block (res = 0.5 * x) and the client verifies every returned element
// equals 0.5.
TEST(SENDANDRECV, CPU) {
  // Clear proxies so the loopback brpc connection is not intercepted.
  setenv("http_proxy", "", 1);
  setenv("https_proxy", "", 1);
  std::string endpoint = "127.0.0.1:4444";
  LOG(INFO) << "before StartSendAndRecvServer";
  b_rpc_service = distributed::HeterServer::GetInstance();
  std::thread server_thread(StartSendAndRecvServer, endpoint);
  // Block until the server thread has brought the service up.
  b_rpc_service->WaitServerReady();
  LOG(INFO) << "before HeterClient::GetInstance";
  distributed::HeterClient* rpc_client =
      distributed::HeterClient::GetInstance({endpoint}, 0).get();
  PADDLE_ENFORCE_NE(rpc_client, nullptr,
                    platform::errors::InvalidArgument(
                        "Client Start Fail, Check Your Code & Env"));
  framework::Scope scope;
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
  // create var on local scope
  int64_t rows_numel = 10;
  LOG(INFO) << "before InitTensorsOnClient";
  InitTensorsOnClient(&scope, &place, rows_numel);
  std::string in_var_name("x");
  std::string out_var_name("res");
  std::vector<std::string> send_var = {in_var_name};
  std::vector<std::string> recv_var = {out_var_name};
  LOG(INFO) << "before SendAndRecvAsync";
  rpc_client->SendAndRecvAsync({endpoint}, ctx, scope, in_var_name, send_var,
                               recv_var);
  auto var = scope.Var(out_var_name);
  auto value = var->GetMutable<framework::LoDTensor>();
  auto ptr = value->mutable_data<float>(place);
  LOG(INFO) << "before CHECK";
  // x was all ones; scale=0.5 should leave every element at 0.5.
  for (int64_t i = 0; i < rows_numel; ++i) {
    LOG(INFO) << "ptr " << i << " is " << ptr[i];
    EXPECT_EQ(ptr[i], 0.5);
  }
  LOG(INFO) << "end CHECK";
  // Shut the worker side down before stopping the server so the join below
  // can complete.
  rpc_client->FinalizeWorker();
  b_rpc_service->Stop();
  LOG(INFO) << "end server Stop";
  server_thread.join();
  LOG(INFO) << "end server thread join";
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
// Attribute-name keys for listen_and_serv. NOTE(review): "checkpint" is a
// long-standing typo in the attribute value — presumably matched by the
// Python transpiler side; do not fix without auditing all producers.
constexpr char kLRDecayBlockId[] = "lr_decay_block_id";
constexpr char kCheckpointBlockId[] = "checkpint_block_id";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
constexpr char kOptimizeBlocks[] = "optimize_blocks";
constexpr char kSparseGradToParam[] = "sparse_grad_to_param";
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace operators {
// Stub operator: RunImpl performs no work and only logs ("just for
// recorder"); the op exists so programs containing listen_and_serv still
// load and run.
class ListenAndServOp : public framework::OperatorBase {
 public:
  ListenAndServOp(const std::string& type,
                  const framework::VariableNameMap& inputs,
                  const framework::VariableNameMap& outputs,
                  const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    VLOG(1) << "just for recorder";
  }
};
// Proto/attribute declaration for listen_and_serv. Fixes the op comment,
// which previously embedded literal `" + "` concatenation fragments inside
// the raw string, and typos ("BolckID", "pserer", "distriubte") in the
// attribute help strings. Attribute names and defaults are unchanged.
class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
    AddComment(R"DOC(
ListenAndServ operator

This operator will start a RPC server which can receive variables from
send_op and send back variables to recv_op.
)DOC");
    AddAttr<std::string>("endpoint",
                         "(string, default 127.0.0.1:6164)"
                         "IP address to listen on.")
        .SetDefault("127.0.0.1:6164")
        .AddCustomChecker([](const std::string& ip) { return !ip.empty(); });
    AddAttr<int>("pserver_id",
                 "(int, default -1), the parameter server index id")
        .SetDefault(-1);
    AddAttr<std::vector<std::string>>(
        "grad_to_block_id",
        "['param1@GRAD.block0:1', 'param2@GRAD.blockn:2'] "
        "a map from grad name to its optimize block id")
        .SetDefault({});
    AddAttr<int>("distributed_mode",
                 "indicate distribute training mode, 0 is sync, 1 is "
                 "fully-async, 2 is half-async, 3 is geo")
        .SetDefault(0);
    AddAttr<bool>("dc_asgd", "set to true will enable DC-ASGD training.")
        .SetDefault(false);
    AddAttr<std::vector<framework::BlockDesc*>>(
        kOptimizeBlocks, "Optimize blocks to run on server side.")
        .SetDefault({});
    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
                                      "prefetch blocks to run on server side.")
        .SetDefault({});
    AddAttr<std::vector<std::string>>(
        kSparseGradToParam,
        "sparse grad name to param name. like: 'emb@Grad:emb'")
        .SetDefault({});
    AddAttr<int>("Fanin", "How many clients send to this server.")
        .SetDefault(1);
    AddAttr<int>(kCheckpointBlockId,
                 "BlockID to run save checkpoint on pserver.")
        .SetDefault(-1);
    AddAttr<int>(kLRDecayBlockId, "BlockID to run lr decay on pserver.")
        .SetDefault(-1);
    AddAttr<int>("rpc_get_thread_num", "pserver get thread num.").SetDefault(1);
    AddAttr<int>("rpc_send_thread_num", "pserver send thread num.")
        .SetDefault(1);
    AddAttr<int>("rpc_prefetch_thread_num", "pserver prefetch thread num.")
        .SetDefault(1);
  }
};
// No shape inference is needed — the op is a stub (see ListenAndServOp).
class ListenAndServOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// No gradient ops are generated for listen_and_serv.
REGISTER_OPERATOR(
    listen_and_serv, ops::ListenAndServOp,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    ops::ListenAndServOpMaker, ops::ListenAndServOpShapeInference);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/distributed/service/heter_client.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
// Kernel that sends the scope's `send_var_name` variables to the heter
// endpoints (keyed by `message_name`) and receives `recv_var_name`
// variables back via the HeterClient singleton.
template <typename DeviceContext, typename T>
class SendAndRecvKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& scope = ctx.scope();
    const auto& place = ctx.GetPlace();
    auto message_name = ctx.Attr<std::string>("message_name");
    auto send_var_name = ctx.Attr<std::vector<std::string>>("send_var_name");
    auto recv_var_name = ctx.Attr<std::vector<std::string>>("recv_var_name");
    auto epmap = ctx.Attr<std::vector<std::string>>("endpoints");
    auto trainer_id = ctx.Attr<int>("trainer_id");
    // Fetch the pooled device context for this place (rather than using the
    // execution context's directly).
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& context = *pool.Get(place);
    distributed::HeterClient* rpc_client =
        distributed::HeterClient::GetInstance(epmap, trainer_id).get();
    VLOG(3) << "SendAndRecvOp message_name: " << message_name;
    rpc_client->SendAndRecvAsync(epmap, context, scope, message_name,
                                 send_var_name, recv_var_name);
  }
};
// Operator shell for send_and_recv: shape inference is a no-op; the kernel
// type takes its data type from input "X" but is pinned to CPUPlace.
class SendAndRecvOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {}

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
    // Always run on CPU regardless of the op's placement.
    return framework::OpKernelType(data_type, platform::CPUPlace());
  }
};
// Proto/attribute declaration for send_and_recv.
class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
    AddOutput("Out", "Tensor Output varibale to be recv").AsDuplicable();
    AddAttr<std::string>("message_name", "");
    AddAttr<std::vector<std::string>>("send_var_name", "Send Tensor's name");
    AddAttr<std::vector<std::string>>("recv_var_name", "Recv Tensor's name");
    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
    AddAttr<std::vector<std::string>>("endpoints", "Server endpoint")
        .SetDefault({"127.0.0.1:6164"});
    AddComment(R"DOC(
    SendAndRecv operator
    This operator will send variables to listen_and_serve op at the parameter server.
    And recv variable from parameter server of send variable's scope.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
// CPU kernel registered for float only.
REGISTER_OP_CPU_KERNEL(
    send_and_recv,
    ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
namespace distributed {
class Communicator;
} // namespace distributed
} // namespace paddle
namespace paddle {
namespace operators {
// Blocks at the global Communicator's barrier so the parameter server can
// know all variables have been sent.
class SendBarrierOp : public framework::OperatorBase {
 public:
  SendBarrierOp(const std::string& type,
                const framework::VariableNameMap& inputs,
                const framework::VariableNameMap& outputs,
                const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    // Inputs/outputs are dummies for control dependency only; the barrier
    // goes through the Communicator singleton.
    paddle::distributed::Communicator::GetInstance()->Barrier();
  }
};
// Proto/attribute declaration for send_barrier.
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Any) Dummy inputs, used for control dependency")
        .AsDuplicable();
    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
        .AsDuplicable();
    AddComment(R"DOC(
SendBarrier operator

This operator will send a send barrier signal to list_and_serv op, so that
the Parameter Server would knew all variables have been sent.
)DOC");
    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
    AddAttr<std::vector<std::string>>("endpoints",
                                      "(string vector, default 127.0.0.1:6164)"
                                      "Server endpoints to send variables to.")
        .SetDefault({"127.0.0.1:6164"});
    AddAttr<bool>(
        "half_async",
        "(bool, default false)"
        "half_async=True is for half_async mode, this will send signal "
        "to HalfAsyncCommunicator Instance")
        .SetDefault(false);
  }
};
// No shape inference needed — inputs/outputs are control-dependency
// dummies.
class SendBarrierOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// No gradient ops are generated for send_barrier.
REGISTER_OPERATOR(
    send_barrier, ops::SendBarrierOp,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    ops::SendBarrierOpMaker, ops::SendBarrierOpShapeInference);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/fleet.h"
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace operators {
namespace distributed {
class RPCClient;
} // namespace distributed
// Forwards the op's "X" inputs to the parameter server through the global
// Communicator instance. (Previously carried large commented-out
// FleetWrapper push paths; dead code removed.)
class SendOp : public framework::OperatorBase {
 public:
  SendOp(const std::string& type, const framework::VariableNameMap& inputs,
         const framework::VariableNameMap& outputs,
         const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    auto ins = Inputs("X");
    auto send_varnames = Attr<std::vector<std::string>>("send_varnames");
    // The Communicator singleton is expected to be initialized before this
    // op runs; Check() validates the send_varnames against its contexts.
    auto* communicator = paddle::distributed::Communicator::GetInstance();
    communicator->Check(send_varnames);
    communicator->Send(ins, scope);
  }
};
// Proto/attribute declaration for send. Note: table_id/is_sparse are
// declared but not read by SendOp::RunImpl in this revision.
class SendOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
        .AsDuplicable();
    AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
        .AsDuplicable();
    AddComment(R"DOC(
Send operator

This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
    AddAttr<int>("table_id", "table_id for send").SetDefault(0);
    AddAttr<int>("is_sparse",
                 "(int, default 0->Dense, 1->Sparse, 2->Distributed)")
        .SetDefault(0);
    AddAttr<std::vector<std::string>>(
        "send_varnames",
        "(vector<string>) "
        "the split output varnames to send to pserver")
        .SetDefault(std::vector<std::string>{});
  }
};
// No shape inference needed — outputs are control-dependency dummies.
class SendOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// No gradient ops are generated for send.
REGISTER_OPERATOR(
    send, ops::SendOp,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    ops::SendOpMaker, ops::SendOpShapeInference);
......@@ -20,10 +20,6 @@ if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op)
endif()
if (WITH_DISTRIBUTE)
list(APPEND PYBIND_DEPS communicator)
endif()
set(PYBIND_SRCS
pybind.cc
exception.cc
......@@ -54,7 +50,10 @@ if (WITH_CRYPTO)
endif (WITH_CRYPTO)
if (WITH_DISTRIBUTE)
list(APPEND PYBIND_SRCS communicator_py.cc)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result")
set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
list(APPEND PYBIND_DEPS fleet communicator)
list(APPEND PYBIND_SRCS fleet_py.cc)
endif()
if (WITH_NCCL)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#include "paddle/fluid/pybind/fleet_py.h"
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/communicator_common.h"
#include "paddle/fluid/distributed/fleet.h"
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/distributed/service/env.h"
#include "paddle/fluid/distributed/service/heter_client.h"
namespace py = pybind11;
using paddle::distributed::CommContext;
using paddle::distributed::Communicator;
using paddle::distributed::FleetWrapper;
using paddle::distributed::HeterClient;
namespace paddle {
namespace pybind {
// Exposes FleetWrapper to Python as DistFleetWrapper; the constructor
// returns the C++ singleton so Python shares the same instance.
void BindDistFleetWrapper(py::module* m) {
  py::class_<FleetWrapper, std::shared_ptr<FleetWrapper>>(*m,
                                                          "DistFleetWrapper")
      .def(py::init([]() { return FleetWrapper::GetInstance(); }))
      .def("load_sparse", &FleetWrapper::LoadSparseOnServer)
      .def("init_server", &FleetWrapper::InitServer)
      // run_server is overloaded: the member-function-pointer casts select
      // the no-arg form vs. the (ip, port) form.
      .def("run_server",
           (uint64_t (FleetWrapper::*)(void)) & FleetWrapper::RunServer)
      .def("run_server", (uint64_t (FleetWrapper::*)(      // NOLINT
                             const std::string&, uint32_t)) &  // NOLINT
                             FleetWrapper::RunServer)
      .def("init_worker", &FleetWrapper::InitWorker)
      .def("push_dense_params", &FleetWrapper::PushDenseParamSync)
      .def("pull_dense_params", &FleetWrapper::PullDenseVarsSync)
      .def("save_all_model", &FleetWrapper::SaveModel)
      .def("save_one_model", &FleetWrapper::SaveModelOneTable)
      .def("sparse_table_stat", &FleetWrapper::PrintTableStat)
      .def("stop_server", &FleetWrapper::StopServer)
      .def("stop_worker", &FleetWrapper::FinalizeWorker)
      .def("barrier", &FleetWrapper::BarrierWithTable);
}  // end BindDistFleetWrapper
// Exposes PSHost (a parameter-server endpoint descriptor) with its
// string/uint64 (de)serialization helpers.
void BindPSHost(py::module* m) {
  py::class_<distributed::PSHost>(*m, "PSHost")
      .def(py::init<const std::string&, uint32_t, uint32_t>())
      .def("serialize_to_string", &distributed::PSHost::serialize_to_string)
      .def("parse_from_string", &distributed::PSHost::parse_from_string)
      .def("to_uint64", &distributed::PSHost::serialize_to_uint64)
      .def("from_uint64", &distributed::PSHost::parse_from_uint64)
      .def("to_string", &distributed::PSHost::to_string);
}
// Exposes CommContext as a read-only record: each accessor returns one
// field of the communication context by value.
void BindCommunicatorContext(py::module* m) {
  py::class_<CommContext>(*m, "CommContext")
      .def(
          py::init<const std::string&, const std::vector<std::string>&,
                   const std::vector<std::string>&, const std::vector<int64_t>&,
                   const std::vector<std::string>&, int, bool, bool, bool,
                   int>())
      .def("var_name", [](const CommContext& self) { return self.var_name; })
      .def("trainer_id",
           [](const CommContext& self) { return self.trainer_id; })
      .def("table_id", [](const CommContext& self) { return self.table_id; })
      .def("split_varnames",
           [](const CommContext& self) { return self.splited_varnames; })
      .def("split_endpoints",
           [](const CommContext& self) { return self.epmap; })
      .def("sections",
           [](const CommContext& self) { return self.height_sections; })
      .def("aggregate", [](const CommContext& self) { return self.merge_add; })
      .def("is_sparse", [](const CommContext& self) { return self.is_sparse; })
      .def("is_distributed",
           [](const CommContext& self) { return self.is_distributed; })
      .def("origin_varnames",
           [](const CommContext& self) { return self.origin_varnames; })
      .def("__str__", [](const CommContext& self) { return self.print(); });
}
using paddle::distributed::AsyncCommunicator;
using paddle::distributed::GeoCommunicator;
using paddle::distributed::RecvCtxMap;
using paddle::distributed::RpcCtxMap;
using paddle::distributed::SyncCommunicator;
using paddle::framework::Scope;
// Binds the Communicator singleton to Python as DistCommunicator (the name
// "Communicator" is already used by the NCCL bindings). The init lambda
// dispatches on `mode` to choose the concrete communicator implementation
// and returns the shared singleton. Fixes the "unsuported" typo in the
// error message and drops the commented-out recv binding.
void BindDistCommunicator(py::module* m) {
  py::class_<Communicator, std::shared_ptr<Communicator>>(*m,
                                                          "DistCommunicator")
      .def(py::init([](const std::string& mode, const std::string& dist_desc,
                       const std::vector<std::string>& host_sign_list,
                       const RpcCtxMap& send_ctx, const RecvCtxMap& recv_ctx,
                       Scope* param_scope,
                       std::map<std::string, std::string>& envs) {
        if (mode == "ASYNC") {
          Communicator::InitInstance<AsyncCommunicator>(
              send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope,
              envs);
        } else if (mode == "SYNC") {
          Communicator::InitInstance<SyncCommunicator>(
              send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope,
              envs);
        } else if (mode == "GEO") {
          Communicator::InitInstance<GeoCommunicator>(
              send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope,
              envs);
        } else {
          PADDLE_THROW(platform::errors::InvalidArgument(
              "unsupported communicator MODE"));
        }
        return Communicator::GetInstantcePtr();
      }))
      .def("stop", &Communicator::Stop)
      .def("start", &Communicator::Start)
      .def("push_sparse_param", &Communicator::RpcSendSparseParam)
      .def("is_running", &Communicator::IsRunning)
      .def("init_params", &Communicator::InitParams);
}
// Exposes HeterClient; the constructor returns the process-wide singleton
// for the given endpoints and trainer id.
void BindHeterClient(py::module* m) {
  py::class_<HeterClient, std::shared_ptr<HeterClient>>(*m, "HeterClient")
      .def(py::init(
          [](const std::vector<std::string>& endpoint, const int& trainer_id) {
            return HeterClient::GetInstance(endpoint, trainer_id);
          }))
      .def("stop", &HeterClient::Stop);
}
} // end namespace pybind
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include "pybind11/pybind11.h"
#include "pybind11/stl.h"

namespace py = pybind11;

namespace paddle {
namespace pybind {

// Pybind registration hooks for the distributed (one-PS) Python API; each
// installs one class into the given module. Implemented in fleet_py.cc and
// invoked from pybind.cc when PADDLE_WITH_DISTRIBUTE is defined.
void BindDistFleetWrapper(py::module* m);
void BindPSHost(py::module* m);
void BindCommunicatorContext(py::module* m);
void BindDistCommunicator(py::module* m);
void BindHeterClient(py::module* m);

}  // namespace pybind
}  // namespace paddle
......@@ -103,14 +103,14 @@ limitations under the License. */
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/pybind/communicator_py.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/pybind/fleet_py.h"
#endif
#include "pybind11/stl.h"
DECLARE_bool(use_mkldnn);
......@@ -2837,10 +2837,13 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_CRYPTO
BindCrypto(&m);
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
BindCommunicator(&m);
BindDistFleetWrapper(&m);
BindPSHost(&m);
BindCommunicatorContext(&m);
BindLargeScaleKV(&m);
BindDistCommunicator(&m);
BindHeterClient(&m);
#endif
}
} // namespace pybind
......
......@@ -212,7 +212,7 @@ function cmake_base() {
fi
if [ "$SYSTEM" == "Darwin" ]; then
WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
WITH_DISTRIBUTE="OFF"
WITH_AVX=${WITH_AVX:-ON}
INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-~/.cache/inference_demo}
else
......@@ -220,13 +220,8 @@ function cmake_base() {
fi
distibuted_flag=${WITH_DISTRIBUTE:-OFF}
grpc_flag=${WITH_GRPC:-${distibuted_flag}}
if [ "$SYSTEM" == "Darwin" ]; then
gloo_flag="OFF"
else
grpc_flag="OFF"
gloo_flag=${distibuted_flag}
fi
cat <<EOF
========================================
......
......@@ -13,6 +13,7 @@
# limitations under the License.
from ..runtime.collective_runtime import CollectiveRuntime
from ..runtime.parameter_server_runtime import ParameterServerRuntime
from ..runtime.the_one_ps import TheOnePSRuntime
class RuntimeFactory(object):
......@@ -26,7 +27,8 @@ class RuntimeFactory(object):
return collective_runtime
k_steps = context["valid_strategy"].a_sync_configs["k_steps"]
if not context["role_maker"]._is_collective and k_steps >= 0:
ps_runtime = ParameterServerRuntime()
ps_runtime = TheOnePSRuntime()
ps_runtime._set_basic_info(context)
return ps_runtime
......@@ -72,7 +72,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
# for startup program
_startup = worker.fake_init_ops_pass(_startup, compiled_config)
_startup = worker.init_from_server_pass(_startup, compiled_config)
_startup = worker.delet_extra_optimizes_pass(_startup,
compiled_config)
......@@ -106,19 +105,37 @@ class ParameterServerOptimizer(MetaOptimizerBase):
wait_server_ready(self.role_maker._get_pserver_endpoints())
# for ps-heter mode, wait heter worker ready
if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
):
wait_server_ready(self.role_maker._get_heter_worker_endpoints())
# if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
# ):
# wait_server_ready(self.role_maker._get_heter_worker_endpoints())
return _main, _startup
def _build_pserver_programs(self, compiled_config):
from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server
_main = fluid.Program()
_startup = fluid.Program()
from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server
if not compiled_config.is_geo_mode():
from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
is_sgd_adam = False
main_program = compiled_config.get_origin_main_program()
ops = _get_optimize_ops(main_program)
if len(ops) == 0:
return _main, _startup
for op in ops:
if op.type in ["sgd", "adam"]:
is_sgd_adam = True
break
if is_sgd_adam:
return _main, _startup
_main = server.add_listen_and_serv_pass(_main, compiled_config)
_main = server.add_rpc_global_flags_pass(_main, compiled_config)
_main = server.add_optimizer_pass(_main, compiled_config)
......@@ -139,12 +156,8 @@ class ParameterServerOptimizer(MetaOptimizerBase):
_main = server.add_listen_and_serv_pass(_main, compiled_config)
_main = server.add_rpc_global_flags_pass(_main, compiled_config)
_main = server.add_geo_optimizer_pass(_main, compiled_config)
_main = server.large_scale_sparse_pass(_main, _main,
compiled_config, False)
_startup = server.build_pserver_startup_program_pass(
_startup, _main, compiled_config)
_startup = server.large_scale_sparse_pass(_startup, _main,
compiled_config, True)
_startup = server.delete_unused_in_startup_pass(_startup, _main,
compiled_config)
......
......@@ -17,10 +17,10 @@ import paddle.fluid as fluid
import math
import numpy as np
from paddle.fluid.framework import Variable
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
import paddle.distributed.fleet as fleet
def sum(input, scope=None):
def sum(input, scope=None, util=None):
"""
distributed sum in fleet
......@@ -45,21 +45,22 @@ def sum(input, scope=None):
res = np.array(scope.find_var(global_cnt.name).get_tensor())
print("sum array: ", paddle.distributed.fleet.sum(res))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(input, Variable):
input = np.array(scope.find_var(input.name).get_tensor())
elif isinstance(input, str):
input = np.array(scope.find_var(input).get_tensor())
old_shape = np.array(input.shape)
output = np.copy(input) * 0
fleet._role_maker._all_reduce(input, output, mode="sum")
output = util.all_reduce(input, "sum")
output = output.reshape(old_shape)
return output
def max(input, scope=None):
def max(input, scope=None, util=None):
"""
distributed max in fleet
......@@ -84,21 +85,22 @@ def max(input, scope=None):
res = np.array(scope.find_var(global_cnt.name).get_tensor())
print("max array: ", paddle.distributed.fleet.max(res))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(input, Variable):
input = np.array(scope.find_var(input.name).get_tensor())
elif isinstance(input, str):
input = np.array(scope.find_var(input).get_tensor())
old_shape = np.array(input.shape)
output = np.copy(input) * 0
fleet._role_maker._all_reduce(input, output, mode="max")
output = util.all_reduce(input, "max")
output = output.reshape(old_shape)
return output
def min(input, scope=None):
def min(input, scope=None, util=None):
"""
distributed min in fleet
......@@ -123,21 +125,22 @@ def min(input, scope=None):
res = np.array(scope.find_var(global_cnt.name).get_tensor())
print("min array: ", paddle.distributed.fleet.min(res))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(input, Variable):
input = np.array(scope.find_var(input.name).get_tensor())
elif isinstance(input, str):
input = np.array(scope.find_var(input).get_tensor())
old_shape = np.array(input.shape)
output = np.copy(input) * 0
fleet._role_maker._all_reduce(input, output, mode="min")
output = util.all_reduce(input, "min")
output = output.reshape(old_shape)
return output
def auc(stat_pos, stat_neg, scope=None):
def auc(stat_pos, stat_neg, scope=None, util=None):
"""
distributed auc in fleet
......@@ -164,9 +167,11 @@ def auc(stat_pos, stat_neg, scope=None):
neg = np.array(scope.find_var(stat_neg.name).get_tensor())
print("auc: ", paddle.distributed.fleet.auc(pos, neg))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(stat_pos, Variable):
stat_pos = np.array(scope.find_var(stat_pos.name).get_tensor())
elif isinstance(stat_pos, str):
......@@ -181,15 +186,14 @@ def auc(stat_pos, stat_neg, scope=None):
stat_pos = stat_pos.reshape(-1)
global_pos = np.copy(stat_pos) * 0
# mpi allreduce
fleet._role_maker._all_reduce(stat_pos, global_pos)
# reshape to its original shape
global_pos = util.all_reduce(stat_pos, "sum")
global_pos = global_pos.reshape(old_pos_shape)
# auc neg bucket
old_neg_shape = np.array(stat_neg.shape)
stat_neg = stat_neg.reshape(-1)
global_neg = np.copy(stat_neg) * 0
fleet._role_maker._all_reduce(stat_neg, global_neg)
global_neg = util.all_reduce(stat_neg, "sum")
global_neg = global_neg.reshape(old_neg_shape)
# calculate auc
......@@ -216,11 +220,10 @@ def auc(stat_pos, stat_neg, scope=None):
else:
auc_value = area / (pos * neg)
fleet._role_maker._barrier_worker()
return auc_value
def mae(abserr, total_ins_num, scope=None):
def mae(abserr, total_ins_num, scope=None, util=None):
"""
distributed mae in fleet
......@@ -242,23 +245,28 @@ def mae(abserr, total_ins_num, scope=None):
res = np.array(scope.find_var(abserr.name).get_tensor())
print("mae: ", paddle.distributed.fleet.mae(res, total_ins_num))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(abserr, Variable):
abserr = np.array(scope.find_var(abserr.name).get_tensor())
elif isinstance(abserr, str):
abserr = np.array(scope.find_var(abserr).get_tensor())
old_metric_shape = np.array(abserr.shape)
abserr = abserr.reshape(-1)
global_metric = np.copy(abserr) * 0
fleet._role_maker._all_reduce(abserr, global_metric)
global_metric = util.all_reduce(abserr, "sum")
global_metric = global_metric.reshape(old_metric_shape)
mae_value = global_metric[0] / total_ins_num
return mae_value
def rmse(sqrerr, total_ins_num, scope=None):
def rmse(sqrerr, total_ins_num, scope=None, util=None):
"""
distributed rmse in fleet
......@@ -280,9 +288,11 @@ def rmse(sqrerr, total_ins_num, scope=None):
res = np.array(scope.find_var(sqrerr.name).get_tensor())
print("rmse: ", paddle.distributed.fleet.rmse(res, total_ins_num))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(sqrerr, Variable):
sqrerr = np.array(scope.find_var(sqrerr.name).get_tensor())
elif isinstance(sqrerr, str):
......@@ -290,13 +300,15 @@ def rmse(sqrerr, total_ins_num, scope=None):
old_metric_shape = np.array(sqrerr.shape)
sqrerr = sqrerr.reshape(-1)
global_metric = np.copy(sqrerr) * 0
fleet._role_maker._all_reduce(sqrerr, global_metric)
global_metric = util.all_reduce(sqrerr, "sum")
global_metric = global_metric.reshape(old_metric_shape)
rmse_value = math.sqrt(global_metric[0] / total_ins_num)
return rmse_value
def mse(sqrerr, total_ins_num, scope=None):
def mse(sqrerr, total_ins_num, scope=None, util=None):
"""
distributed mse in fleet
......@@ -318,9 +330,11 @@ def mse(sqrerr, total_ins_num, scope=None):
metric = np.array(scope.find_var(sqrerr.name).get_tensor())
print("mse: ", paddle.distributed.fleet.mse(metric, total_ins_num))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(sqrerr, Variable):
sqrerr = np.array(scope.find_var(sqrerr.name).get_tensor())
elif isinstance(sqrerr, str):
......@@ -328,13 +342,15 @@ def mse(sqrerr, total_ins_num, scope=None):
old_metric_shape = np.array(sqrerr.shape)
sqrerr = sqrerr.reshape(-1)
global_metric = np.copy(sqrerr) * 0
fleet._role_maker._all_reduce(sqrerr, global_metric)
global_metric = util.all_reduce(sqrerr, "sum")
global_metric = global_metric.reshape(old_metric_shape)
mse_value = global_metric[0] / total_ins_num
return mse_value
def acc(correct, total, scope=None):
def acc(correct, total, scope=None, util=None):
"""
distributed accuracy in fleet
......@@ -367,9 +383,11 @@ def acc(correct, total, scope=None):
total_num = np.array(scope.find_var(total.name).get_tensor())
print("accuracy: ", paddle.distributed.fleet.acc(correct_num, total_num))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(correct, Variable):
correct = np.array(scope.find_var(correct.name).get_tensor())
elif isinstance(correct, str):
......@@ -378,8 +396,11 @@ def acc(correct, total, scope=None):
total = np.array(scope.find_var(total.name).get_tensor())
elif isinstance(total, str):
total = np.array(scope.find_var(total).get_tensor())
global_correct_num = np.copy(correct) * 0
global_total_num = np.copy(total) * 0
fleet._role_maker._all_reduce(correct, global_correct_num)
fleet._role_maker._all_reduce(total, global_total_num)
global_correct_num = util.all_reduce(correct, "sum")
global_total_num = util.all_reduce(total, "sum")
return float(global_correct_num[0]) / float(global_total_num[0])
......@@ -14,3 +14,4 @@
from .collective_runtime import CollectiveRuntime
from .parameter_server_runtime import ParameterServerRuntime
from .the_one_ps import TheOnePSRuntime
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import os
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import Program
from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from paddle.fluid.framework import Variable, Parameter
from .runtime_base import RuntimeBase
from ..base.private_helper_function import wait_server_ready
def conv_indent(indent):
    """Return a string of `indent` spaces, used to indent nested proto-text blocks."""
    # " " * n is the idiomatic (and equivalent) form of "".join([" "] * n).
    return " " * indent
class Accessor:
    """Proto-text builder for the `accessor` section of a table config."""

    def __init__(self):
        self.accessor_class = ""
        # Optional optimizer config rendered inside the accessor block.
        # (The original assigned self.optimizer = None twice; once suffices.)
        self.optimizer = None
        self.feature_dim = -1
        self.embedding_dim = -1

    def to_string(self, indent):
        """Render this accessor as an indented `accessor { ... }` proto-text block."""
        accessor_str = "{}accessor {{{}\n{}}}"
        attrs = ""
        attrs += "accessor_class: \"{}\" ".format(self.accessor_class)
        attrs += "fea_dim: {} ".format(self.feature_dim)
        attrs += "embedx_dim: {} ".format(self.embedding_dim)
        attrs += "\n"
        if self.optimizer is not None:
            attrs += self.optimizer.to_string(indent)
        return accessor_str.format(
            conv_indent(indent), attrs, conv_indent(indent))
class CommonAccessor:
    """Builds the `common` section of a table config: accessor/optimizer name,
    parameter slot names, per-slot dims and initializers, plus trainer count
    and sync flag — all derived from the origin programs' optimizer ops.
    """

    def __init__(self):
        self.accessor_class = ""
        self.table_name = None
        self.attrs = []
        self.params = []
        self.dims = []
        self.trainer_num = 0
        self.sync = "false"
        self.initializers = []
        # Lookup tables describing each supported optimizer's inputs/attrs and
        # how initializer ops map to serialized attribute lists; filled below.
        self.opt_input_map = {}
        self.opt_attr_map = {}
        self.opt_init_map = {}
        self.define_optimize_map()

    def define_optimize_map(self):
        """Populate the optimizer input/attr/initializer lookup tables."""
        # (input_name, shape) pairs; shape None means "derived from the
        # parameter", an integer is a fixed dim (e.g. scalar learning rate).
        opt_input_map = {}
        opt_input_map["sgd"] = [("Param", None), ("LearningRate", 1)]
        opt_input_map["adam"] = [("Param", None), ("Moment1", None),
                                 ("Moment2", None), ("Beta1Pow", 1),
                                 ("Beta2Pow", 1), ("LearningRate", 1)]
        opt_input_map["sum"] = [("Param", None)]

        # (attr_name, type_tag) pairs, later serialized as "name&type&value".
        opt_attr_map = {}
        opt_attr_map["sgd"] = []
        opt_attr_map["sum"] = []
        opt_attr_map["adam"] = [("beta1", "f"), ("beta2", "f"),
                                ("epsilon", "f")]

        # Initializer op type -> the op attributes worth serializing.
        opt_init_map = {}
        opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
        opt_init_map["fill_constant"] = ["value"]
        opt_init_map["uniform_random"] = ["seed", "min", "max"]
        opt_init_map["truncated_gaussian_random"] = ["seed", "mean", "std"]

        self.opt_attr_map = opt_attr_map
        self.opt_input_map = opt_input_map
        self.opt_init_map = opt_init_map

    def get_shard(self, total_dim, shard_num, pserver_id):
        """Return how many rows of a dense parameter land on shard `pserver_id`.

        Splits `total_dim` rows into blocks of int(total_dim / shard_num + 1);
        the last occupied shard gets the remainder, later shards get 0.
        """
        # remainder = total_dim % shard_num
        blocksize = int(total_dim / shard_num + 1)

        if blocksize * (pserver_id + 1) <= total_dim:
            return blocksize
        else:
            if blocksize * pserver_id < total_dim:
                # partial tail block for this shard
                return total_dim - blocksize * pserver_id
            else:
                # shard beyond the data: nothing assigned
                return 0

    def get_initializer_attr(self, value_name, o_startup_program):
        """Serialize the startup-program initializer op of `value_name` as
        "op_type&attr1&attr2...", or "" when no known initializer op is found."""
        l_in = "&"
        attr_str = ""

        origin_var_name = value_name
        for op in o_startup_program.global_block().ops:
            # Match the first recognized initializer op whose output is this var.
            if op.type in self.opt_init_map.keys(
            ) and origin_var_name == op.output("Out")[0]:
                init_attr = [op.type]
                for attr in self.opt_init_map[op.type]:
                    init_attr.append(str(op.attr(attr)))
                attr_str = l_in.join(init_attr)
                break
        return attr_str

    def parse_by_optimizer(self, grad_name, is_sparse, total_dims,
                           compiled_strategy):
        """Fill params/dims/initializers/attrs from the optimizer op that
        updates the parameter behind `grad_name`.

        Raises ValueError when no optimizer op exists for the parameter.
        """
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
        param_name = compiled_strategy.grad_name_to_param_name[grad_name]
        main_program, startup_program = compiled_strategy.get_origin_programs()
        pserver_id = compiled_strategy.get_role_id()
        pserver_num = len(compiled_strategy.get_ps_endpoints())
        optimizer_ops = _get_optimize_ops(main_program)
        oop = None

        # Locate the optimizer op whose "Param" input is this parameter.
        for op in optimizer_ops:
            if op.input("Param")[0] == param_name:
                oop = op
                break

        if oop is None:
            raise ValueError("can not find optimizer for {}".format(grad_name))

        params = []
        dims = []
        attrs = []
        initializers = []

        self.trainer_num = compiled_strategy.get_trainers()

        if compiled_strategy.is_geo_mode():
            # geo mode accumulates deltas server-side with a plain "sum" accessor
            param_varnames = self.opt_input_map["sum"]
            attr_varnames = self.opt_attr_map["sum"]
            self.accessor_class = "sum"
        else:
            param_varnames = self.opt_input_map[oop.type]
            attr_varnames = self.opt_attr_map[oop.type]
            self.accessor_class = oop.type

        for (formal_name, shape) in param_varnames:
            params.append(formal_name)
            param = main_program.global_block().vars[oop.input(formal_name)[0]]
            # Learning-rate decay schedules are not supported yet; fall back
            # to the plain "learning_rate_0" variable.
            if formal_name == "LearningRate" and param.name != "learning_rate_0":
                warnings.warn("will support decay soon")
                param = main_program.global_block().vars["learning_rate_0"]

            if shape is None:
                if is_sparse:
                    shape = total_dims
                else:
                    # dense params are row-sharded across pservers
                    shape = self.get_shard(total_dims, pserver_num, pserver_id)
            dims.append(shape)

            if formal_name == "Param":
                # fixed default initializer for the parameter slot itself
                initializer = "uniform_random&0&-1.0&1.0"
            else:
                initializer = self.get_initializer_attr(param.name,
                                                        startup_program)
            initializers.append(initializer)

        for (attr_varname, type_) in attr_varnames:
            value = oop.attr(attr_varname)
            attrs.append("&".join([attr_varname, type_, str(value)]))

        self.params = params
        self.dims = dims
        self.initializers = initializers
        self.attrs = attrs

    def to_string(self, indent):
        """Render a `common { ... }` proto-text block."""
        accessor_str = "{}common {{{}\n{}}}"
        attrs = ""
        attrs += "name: \"{}\" ".format(self.accessor_class)

        if self.table_name:
            attrs += "table_name: \"{}\" ".format(self.table_name)

        attrs += "trainer_num: {} ".format(self.trainer_num)
        attrs += "sync: {} ".format(self.sync)

        for param in self.params:
            attrs += "params: \"{}\" ".format(param)

        for dim in self.dims:
            attrs += "dims: {} ".format(dim)

        for initializer in self.initializers:
            attrs += "initializers: \"{}\" ".format(initializer)

        attrs += "\n"
        return accessor_str.format(
            conv_indent(indent), attrs, conv_indent(indent))
class Table:
    """One `downpour_table_param` entry: id/class/type plus its nested
    accessor and common sections."""

    def __init__(self):
        self.id = -1
        self.table_class = None
        self.shard_num = -1
        self.type = None
        self.accessor = None
        self.common = None

    def to_string(self, indent):
        """Render `downpour_table_param { ... }` with children indented two
        extra columns (the original also uses the deeper indent for the
        outer braces)."""
        pieces = [
            "table_id: {} ".format(self.id),
            "table_class: \"{}\" ".format(self.table_class),
            "shard_num: {} ".format(self.shard_num),
            "type: {}".format(self.type),
            "\n",
        ]
        child_indent = indent + 2
        if self.accessor is not None:
            pieces.append(self.accessor.to_string(child_indent))
            pieces.append("\n")
        if self.common is not None:
            pieces.append(self.common.to_string(child_indent))
            pieces.append("\n")
        pad = conv_indent(child_indent)
        return "{}downpour_table_param {{{}\n{}}}".format(
            pad, "".join(pieces), pad)
class Service:
    """Describes the rpc service classes/ports in the server proto."""

    def __init__(self):
        # Defaults match the brpc-based PS implementation.
        self.server_class = "BrpcPsServer"
        self.client_class = "BrpcPsClient"
        self.service_class = "PsService"
        self.start_server_port = 0
        self.server_thread_num = 12

    def to_string(self, indent):
        """Render a `service_param { ... }` proto-text block."""
        pad = conv_indent(indent)
        fields = [
            "server_class: \"{}\" ".format(self.server_class),
            "client_class: \"{}\" ".format(self.client_class),
            "service_class: \"{}\" ".format(self.service_class),
            "start_server_port: {} ".format(self.start_server_port),
            "server_thread_num: {} ".format(self.server_thread_num),
        ]
        return "{}service_param {{{}\n{}}}".format(pad, "".join(fields), pad)
class DownpourServer:
    """Aggregates one Service plus its tables into `downpour_server_param`."""

    def __init__(self):
        self.service = None
        self.tables = []

    def set_service_param(self, service):
        # Install the Service describing rpc classes/ports for this server.
        self.service = service

    def append_tables(self, table):
        """Add a Table; anything that is not a Table instance is rejected."""
        if not isinstance(table, Table):
            raise ValueError("only support instance Table")
        self.tables.append(table)

    def to_string(self, indent):
        """Render `downpour_server_param { service ... tables ... }`."""
        inner = indent + 2
        chunks = ["\n" + self.service.to_string(inner)]
        for tbl in self.tables:
            chunks.append("\n" + tbl.to_string(inner))
        pad = conv_indent(inner)
        return "{}downpour_server_param {{{}\n{}}}".format(
            pad, "".join(chunks), pad)
class Server:
    """Top-level `server_param` wrapper holding DownpourServer entries."""

    def __init__(self):
        self.servers = []

    def add_server(self, server):
        """Add a DownpourServer; any other type is rejected."""
        if not isinstance(server, DownpourServer):
            raise ValueError("only support instance DownpourServer")
        self.servers.append(server)

    def __str__(self):
        # Each server is rendered at indent 2, preceded by a newline.
        body = "".join("\n" + s.to_string(2) for s in self.servers)
        return "server_param {{{}\n}}".format(body)
class DownpourWorker:
    """Collects worker-side tables for `downpour_worker_param`."""

    def __init__(self):
        self.tables = []

    def append_tables(self, table):
        """Add a Table; anything that is not a Table instance is rejected."""
        if not isinstance(table, Table):
            raise ValueError("only support instance Table")
        self.tables.append(table)

    def to_string(self, indent):
        """Render `downpour_worker_param { tables... }` (children and outer
        braces both use indent + 2, matching the server side)."""
        inner = indent + 2
        body = "".join("\n" + t.to_string(inner) for t in self.tables)
        pad = conv_indent(inner)
        return "{}downpour_worker_param {{{}\n{}}}".format(pad, body, pad)
class Worker:
    """Top-level `worker_param` wrapper holding DownpourWorker entries."""

    def __init__(self):
        self.workers = []

    def add_worker(self, worker):
        """Add a DownpourWorker; any other type is rejected."""
        if not isinstance(worker, DownpourWorker):
            raise ValueError("only support instance DownpourWorker")
        self.workers.append(worker)

    def __str__(self):
        # Each worker is rendered at indent 2, preceded by a newline.
        body = "".join("\n" + w.to_string(2) for w in self.workers)
        return "worker_param {{{}\n}}".format(body)
class TheOnePSRuntime(RuntimeBase):
def __init__(self):
super(TheOnePSRuntime, self).__init__()
self._communicator = None
self._server = None
self._worker = fluid.core.DistFleetWrapper()
self._heter_client = None
def _set_basic_info(self, context):
self.context = context
self.role_maker = context["role_maker"]
self.origin_main_program = context["origin_main_program"]
self.origin_startup_program = context["origin_startup_program"]
self.async_strategy = self._get_distributed_strategy()
self.compiled_strategy = self.build_compiled_startegy()
def _get_distributed_strategy(self):
strategy = None
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
StrategyFactory
dist_strategy = self.context["valid_strategy"]
k_steps = dist_strategy.a_sync_configs["k_steps"]
if not dist_strategy.a_sync and k_steps == 0:
strategy = StrategyFactory.create_sync_strategy()
if dist_strategy.a_sync and k_steps == 0:
strategy = StrategyFactory.create_async_strategy()
if dist_strategy.a_sync and k_steps > 0:
strategy = StrategyFactory.create_geo_strategy(k_steps)
if not strategy:
raise ValueError("k_steps must be invalid value, please check")
return strategy
def build_compiled_startegy(self):
from paddle.fluid.incubate.fleet.parameter_server.ir.public import CompileTimeStrategy
compiled_config = CompileTimeStrategy(
self.origin_main_program, self.origin_main_program,
self.async_strategy, self.role_maker)
return compiled_config
def _init_worker(self):
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
SyncStrategy, GeoStrategy
is_sync = self.compiled_strategy.is_sync_mode()
worker = self._get_fleet_proto(is_server=False, is_sync=is_sync)
server = self._get_fleet_proto(is_server=True, is_sync=is_sync)
def sync_strategy_envs():
kwargs = {}
kwargs[
"pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
kwargs["trainer_id"] = self.role_maker._worker_index()
return kwargs
proto_txt = str(worker) + "\n" + str(server)
debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
if debug:
print("worker: \n{}".format(proto_txt))
endpoints = self.compiled_strategy.get_ps_endpoints()
string_hosts = []
for idx, ep in enumerate(endpoints):
host, port = ep.split(":")
pshost = fluid.core.PSHost(host, int(port), idx)
string_hosts.append(pshost.serialize_to_string())
dense_map = self.compiled_strategy.get_the_one_recv_context(
split_dense_table=self.role_maker._is_heter_parameter_server_mode)
send_ctx = self.compiled_strategy.get_the_one_send_context(
split_dense_table=self.role_maker._is_heter_parameter_server_mode,
ep_list=endpoints)
trainer_config = self.async_strategy.get_trainer_runtime_config()
debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
if debug:
print("worker: \n{}".format(proto_txt))
print("communicator send_ctx:")
for key in send_ctx:
print("{}: {}".format(key, send_ctx[key]))
for key in dense_map:
print("{}: {}".format(key, dense_map[key]))
kwargs = {}
kwargs['need_global_step'] = "0"
kwargs["trainer_id"] = self.role_maker._role_id()
kwargs["trainers"] = self.role_maker._worker_num()
if self.role_maker._is_heter_worker():
kwargs["trainer_id"] += kwargs["trainers"]
for table in server.servers[0].tables:
if table.table_class == "BarrierTable":
kwargs["barrier_table_id"] = table.id
break
if isinstance(self.async_strategy, SyncStrategy):
sync_kwargs = sync_strategy_envs()
kwargs.update(sync_kwargs)
from paddle.fluid.communicator import Communicator, HeterClient
self._communicator = Communicator(
trainer_config.mode, kwargs,
trainer_config.get_communicator_flags())
self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt,
string_hosts, fluid.global_scope())
dist_strategy = self.context["valid_strategy"]
is_test = bool(int(os.getenv("TEST_MODE", "0")))
if self.role_maker._is_first_worker(
) and self.role_maker._is_heter_parameter_server_mode:
# for ps-heter mode load all parameters on first_worker
init_params = self.compiled_strategy.get_the_one_recv_context(
split_dense_table=True, use_origin_program=True)
else:
init_params = dense_map
if not is_test:
self._communicator.init_params(init_params)
if not self._communicator.is_running():
self._communicator.start()
else:
warnings.warn("communicator has been initialized, skip")
launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
if launch_barrier and launch_barrier_flag:
# for trainer wait server ready
wait_server_ready(self.role_maker._get_pserver_endpoints())
# for ps-heter mode, wait heter worker ready
if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
):
wait_server_ready(self.role_maker._get_heter_worker_endpoints())
self._heter_client = HeterClient(
self.role_maker._get_heter_worker_endpoints(),
self.role_maker._role_id())
def _push_sparse_param(self,
var_name,
table_id=-1,
scope=fluid.global_scope()):
self._communicator.push_sparse_param(var_name, table_id, scope)
def _get_executor(self):
executor = fluid.Executor(fluid.CPUPlace())
if self.role_maker._is_heter_parameter_server_mode:
heter_worker_device_guard = self.context[
"valid_strategy"].a_sync_configs[
"heter_worker_device_guard"].upper()
if heter_worker_device_guard not in ["GPU", "XPU", "CPU"]:
raise ValueError("Heter Worker Not Support Device {}".format(
heter_worker_device_guard))
if self.role_maker._is_heter_worker():
if heter_worker_device_guard == "GPU":
executor = Executor(
fluid.CUDAPlace(
int(os.getenv("FLAGS_selected_gpus", "0"))))
elif heter_worker_device_guard == "XPU":
executor = Executor(
fluid.XPUPlace(
int(os.getenv("FLAGS_selected_xpus", "0"))))
return executor
def _get_fleet_proto(self, is_server, is_sync):
def _build_merge_accessor(ctx):
accessor = Accessor()
accessor.accessor_class = "CommMergeAccessor"
accessor.optimizer = None
if ctx.is_sparse():
accessor.feature_dim = ctx.sections()[0]
accessor.embedding_dim = ctx.sections()[1]
else:
accessor.feature_dim = ctx.sections()[0]
accessor.embedding_dim = 1
return accessor
def _build_barrier_table(idx):
table = Table()
table.id = idx
table.type = "PS_OTHER_TABLE"
table.table_class = "BarrierTable"
table.shard_num = 256
accessor = Accessor()
accessor.accessor_class = "CommMergeAccessor"
accessor.optimizer = None
accessor.feature_dim = 0
accessor.embedding_dim = 0
table.accessor = accessor
common = CommonAccessor()
common.table_name = "barrier_table"
trainer_num = self.compiled_strategy.get_trainers()
if self.role_maker._is_heter_parameter_server_mode:
trainer_num += len(self.role_maker._get_heter_worker_endpoints(
))
common.trainer_num = trainer_num
common.attrs = ""
common.dims = []
common.params = []
table.common = common
return table
def _get_tables():
send_ctx = self.compiled_strategy.get_the_one_send_context(
use_origin_program=True,
split_dense_table=self.role_maker.
_is_heter_parameter_server_mode)
tables = [i for i in range(len(send_ctx) + 1)]
for idx, (name, ctx) in enumerate(send_ctx.items()):
table = Table()
table.id = ctx.table_id()
if ctx.is_sparse():
if len(ctx.origin_varnames()) < 1:
continue
table.type = "PS_SPARSE_TABLE"
if self.compiled_strategy.is_geo_mode():
table.table_class = "SparseGeoTable"
else:
table.table_class = "CommonSparseTable"
table.shard_num = 256
else:
if len(ctx.origin_varnames()) < 1:
continue
table.type = "PS_DENSE_TABLE"
table.table_class = "CommonDenseTable"
table.shard_num = 256
common = CommonAccessor()
if ctx.is_sparse():
common.table_name = self.compiled_strategy.grad_name_to_param_name[
ctx.origin_varnames()[0]]
else:
common.table_name = "MergedDense"
common.parse_by_optimizer(ctx.origin_varnames()[0],
ctx.is_sparse(),
ctx.sections()[1] if ctx.is_sparse()
else ctx.sections()[0],
self.compiled_strategy)
if is_sync:
common.sync = "true"
else:
common.sync = "false"
table.common = common
accessor = _build_merge_accessor(ctx)
table.accessor = accessor
tables[table.id] = table
barrier_table = _build_barrier_table(len(send_ctx))
tables[-1] = barrier_table
return tables
if is_server:
server = Server()
downpour_server = DownpourServer()
service = Service()
downpour_server.set_service_param(service)
tables = _get_tables()
downpour_server.tables = tables
server.add_server(downpour_server)
return server
else:
worker = Worker()
downpour_worker = DownpourWorker()
tables = _get_tables()
downpour_worker.tables = tables
worker.add_worker(downpour_worker)
return worker
def _init_server(self, dirname=None, var_names=None, **kwargs):
if self.role_maker._is_heter_worker():
self._init_heter_worker()
return
role_id = self.compiled_strategy.get_role_id()
endpoints = self.compiled_strategy.get_ps_endpoints()
is_sync = self.compiled_strategy.is_sync_mode()
server = self._get_fleet_proto(is_server=True, is_sync=is_sync)
proto_txt = str(server)
debug = bool(os.getenv("PSERVER_DEBUG", "0"))
if debug:
print("server: \n{}".format(proto_txt))
string_hosts = []
for idx, ep in enumerate(endpoints):
host, port = ep.split(":")
pshost = fluid.core.PSHost(host, int(port), idx)
string_hosts.append(pshost.serialize_to_string())
self._server = fluid.core.DistFleetWrapper()
self._server.init_server(proto_txt, string_hosts, role_id)
from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
dist_varnames = get_sparse_tablenames(self.origin_main_program, True)
sparse_varnames = get_sparse_tablenames(self.origin_main_program, False)
distributed_varnames = dist_varnames + sparse_varnames
if var_names is None:
load_varnames = distributed_varnames
else:
for var_name in var_names:
if var_name not in distributed_varnames:
raise ValueError(
"fleet.init server can only load sparse variables in {}".
format(distributed_varnames))
load_varnames = var_names
if dirname is None or not load_varnames:
return
sparse_table_maps = {}
for table in server.servers[0].tables:
if table.type == "PS_SPARSE_TABLE" and table.common is not None:
sparse_table_maps[table.common.table_name] = table.id
dirname = os.path.normpath(dirname)
pserver_id = self.role_maker._role_id()
import time
begin = time.time()
for var_name in load_varnames:
table_id = sparse_table_maps[var_name]
path = os.path.join(dirname, var_name,
"{}.block{}.txt".format(var_name, pserver_id))
meta = os.path.join(dirname, var_name,
"{}.block{}.meta".format(var_name, pserver_id))
self._server.load_sparse(path, meta, table_id)
end = time.time()
print("init sparse variables: {} cost time: {}".format(load_varnames,
end - begin))
def _run_server(self):
if self.role_maker._is_heter_worker():
self._run_heter_worker()
return
ep = self.compiled_strategy.get_ps_endpoint()
host, port = ep.split(":")
self._server.run_server(host, int(port))
def _init_heter_worker(self):
executor = self._get_executor()
executor.run(fluid.default_startup_program())
self._init_worker()
def _run_heter_worker(self):
executor = self._get_executor()
executor.run(fluid.default_main_program())
def _stop_worker(self):
self._communicator.stop()
if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
):
self._heter_client.stop()
executor = self._get_executor()
executor.close()
@staticmethod
def __exclude_vars(exclude_var_names=[]):
def is_valid(var):
if var.name in exclude_var_names:
return False
from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts
origin_varname, _, _ = _get_varname_parts(var.name)
if origin_varname.endswith("@GRAD"):
return False
if origin_varname == "learning_rate_0":
return False
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER:
return False
return var.persistable
return is_valid
def _save_sparse_params(self, executor, dirname, context, main_program):
values = []
for id, names in context.items():
values.extend(names)
self._worker.save_one_model(id, dirname, 0)
return values
def _save_distributed_persistables(self, executor, dirname, main_program,
mode):
denses = self.compiled_strategy.get_the_one_recv_context(
is_dense=True,
split_dense_table=self.role_maker._is_heter_parameter_server_mode,
use_origin_program=True)
sparses = self.compiled_strategy.get_the_one_recv_context(
is_dense=False,
split_dense_table=self.role_maker._is_heter_parameter_server_mode,
use_origin_program=True)
recv_sparse_varnames = self._save_sparse_params(executor, dirname,
sparses, main_program)
recv_dense_varnames = []
for id, names in denses.items():
recv_dense_varnames.extend(names)
saved_varnames = recv_sparse_varnames
remaining_vars = list(
filter(
TheOnePSRuntime.__exclude_vars(saved_varnames),
main_program.list_vars()))
fluid.io.save_vars(
executor,
main_program=main_program,
dirname=dirname,
vars=remaining_vars)
def _ps_inference_save_persistables(self,
executor,
dirname,
main_program=None,
mode=0,
**kwargs):
"""
This function filters out all variables with `persistable==True` from the
give `main_program` and then saves these variables to the folder `dirname`
or file `filename`.
The `dirname` is used to specify the folder where persistable variables
are going to be saved. If you would like to save variables in separate
files, set `filename` None; if you would like to save all variables in a
single file, use `filename` to specify the file name.
"""
if isinstance(executor, ParallelExecutor):
raise TypeError(
"in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
)
if not isinstance(executor, Executor):
raise TypeError(
"in fleet.save_persistables() function, executor must be as Executor type"
)
if main_program is None:
main_program = self.compiled_strategy.get_origin_ps_main_program()
if isinstance(main_program, CompiledProgram):
raise TypeError(
"in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
)
self._save_distributed_persistables(executor, dirname, main_program,
mode)
def _ps_inference_save_inference_model(self,
                                       executor,
                                       dirname,
                                       feeded_var_names,
                                       target_vars,
                                       main_program=None,
                                       export_for_deployment=True):
    """
    Prune ``main_program`` into an inference-only program and save it,
    together with all related parameters, under ``dirname`` using
    ``executor``.
    """
    # Guard against executor types that the PS runtime cannot drive.
    if isinstance(executor, ParallelExecutor):
        raise TypeError(
            "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
        )
    if not isinstance(executor, Executor):
        raise TypeError(
            "in fleet.save_inference_model() function, executor must be as Executor type"
        )

    if main_program is not None:
        if isinstance(main_program, CompiledProgram):
            raise TypeError(
                "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
            )
        # Explicit program: a plain save is sufficient.
        fluid.io.save_inference_model(dirname, feeded_var_names,
                                      target_vars, executor, main_program,
                                      None, None, export_for_deployment)
        return

    # No program given: export from the origin main program, then reload the
    # pruned __model__ file so the distributed parameter info can be copied
    # onto it before persisting the parameters.
    fluid.io.save_inference_model(dirname, feeded_var_names, target_vars,
                                  executor, self.origin_main_program, None,
                                  None, export_for_deployment, True)
    model_path = os.path.join(dirname, "__model__")
    with open(model_path, "rb") as model_file:
        desc_bytes = model_file.read()

    inference_program = Program.parse_from_string(desc_bytes)
    inference_program._copy_dist_param_info_from(fluid.default_main_program())
    self._ps_inference_save_persistables(
        executor, dirname, inference_program, mode=0)
def _save_inference_model(self, *args, **kwargs):
    # Public entry point: delegates unchanged to the PS-specific
    # inference-model saver.
    self._ps_inference_save_inference_model(*args, **kwargs)
def _save_persistables(self, *args, **kwargs):
    # Public entry point: delegates unchanged to the PS-specific
    # persistables saver.
    self._ps_inference_save_persistables(*args, **kwargs)
......@@ -13,3 +13,4 @@
# limitations under the License.
from .fs import LocalFS, HDFSClient
from .ps_util import Distributed
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parameter Server utils"""
import numpy as np
class Distributed:
    """Utility pass that rewrites sparse lookup ops for distributed serving."""

    @staticmethod
    def estimate(main_program, varname2tables):
        # Rewrite every remote-prefetch lookup_table(_v2) op in
        # ``main_program`` into a fused ``distributed_lookup_table`` op,
        # using ``varname2tables`` to map embedding variable names to
        # parameter-server table ids. Returns the mutated program.
        def distributed_ops_pass(program):
            # Maps sparse op type -> name of its weight input slot.
            SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"}

            def _get_pull_sparse_ops(_program):
                # Group all remote-prefetch sparse lookup ops by the name of
                # the embedding parameter they read.
                pull_sparse_ops = {}
                for op in _program.global_block().ops:
                    if op.type in SPARSE_OP_TYPE_DICT.keys() \
                            and op.attr('remote_prefetch') is True:
                        param_name = op.input(SPARSE_OP_TYPE_DICT[op.type])[0]
                        ops = pull_sparse_ops.get(param_name, [])
                        ops.append(op)
                        pull_sparse_ops[param_name] = ops
                return pull_sparse_ops

            def _pull_sparse_fuse(_program, pull_sparse_ops):
                # For each embedding parameter, replace all of its lookup ops
                # with a single fused distributed_lookup_table op.
                for param, ops in pull_sparse_ops.items():
                    all_ops = program.global_block().ops
                    op_idxs = [all_ops.index(op) for op in ops]
                    inputs = [
                        program.global_block().vars[op.input("Ids")[0]]
                        for op in ops
                    ]
                    w = program.global_block().vars[ops[0].input("W")[0]]
                    if w.name not in varname2tables.keys():
                        raise ValueError(
                            "can not find variable {}, please check your configuration".
                            format(w.name))

                    table_id = varname2tables[w.name]

                    # Attributes are taken from the first op of the group;
                    # presumably all grouped ops share them — TODO confirm.
                    padding_idx = ops[0].attr("padding_idx")
                    is_distributed = ops[0].attr("is_distributed")
                    op_type = ops[0].type

                    outputs = [
                        program.global_block().vars[op.output("Out")[0]]
                        for op in ops
                    ]

                    # Remove the original ops back-to-front so earlier
                    # indices stay valid.
                    for idx in op_idxs[::-1]:
                        program.global_block()._remove_op(idx)

                    # Locate, over the remaining ops, the last producer of
                    # each Ids input and the first consumer of each output.
                    inputs_idxs = [-1] * len(inputs)
                    outputs_idxs = [-1] * len(outputs)

                    for idx, op in enumerate(program.global_block().ops):
                        for i in range(0, len(op.output_names)):
                            outs = op.output(op.output_names[i])
                            for in_id, in_var in enumerate(inputs):
                                if in_var.name in outs:
                                    inputs_idxs[in_id] = idx
                        for i in range(0, len(op.input_names)):
                            ins = op.input(op.input_names[i])
                            for out_id, out_var in enumerate(outputs):
                                if out_var.name in ins:
                                    outputs_idxs[out_id] = idx

                    if min(outputs_idxs) - max(inputs_idxs) >= 1:
                        # There is a gap between all producers and all
                        # consumers: insert the fused op right after the last
                        # producer.
                        distributed_idx = max(inputs_idxs) + 1

                        program.global_block()._insert_op(
                            index=distributed_idx,
                            type="distributed_lookup_table",
                            inputs={"Ids": inputs,
                                    'W': w},
                            outputs={"Outputs": outputs},
                            attrs={
                                "is_distributed": is_distributed,
                                "padding_idx": padding_idx,
                                "table_id": table_id,
                                "lookup_table_version": op_type
                            })
                    else:
                        # No single insertion point satisfies every
                        # producer/consumer ordering constraint.
                        raise ValueError(
                            "something wrong with Fleet, submit a issue is recommended"
                        )

            pull_sparse_ops = _get_pull_sparse_ops(program)
            _pull_sparse_fuse(program, pull_sparse_ops)
            return program

        covert_program = distributed_ops_pass(main_program)
        return covert_program
......@@ -216,25 +216,6 @@ def __bootstrap__():
read_env_flags.append('tracer_mkldnn_ops_on')
read_env_flags.append('tracer_mkldnn_ops_off')
if core.is_compiled_with_dist():
#env for rpc
read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_retry_times')
read_env_flags.append('rpc_server_profile_path')
read_env_flags.append('enable_rpc_profiler')
read_env_flags.append('rpc_send_thread_num')
read_env_flags.append('rpc_get_thread_num')
read_env_flags.append('rpc_prefetch_thread_num')
read_env_flags.append('rpc_disable_reuse_port')
read_env_flags.append('rpc_retry_bind_port')
read_env_flags.append('worker_update_interval_secs')
if core.is_compiled_with_brpc():
read_env_flags.append('max_body_size')
#set brpc max body size
os.environ['FLAGS_max_body_size'] = "2147483647"
if core.is_compiled_with_cuda():
read_env_flags += [
'fraction_of_gpu_memory_to_use',
......
......@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
from .proto import framework_pb2
from paddle.fluid import framework as framework
from . import core
......@@ -376,21 +377,29 @@ def _append_grad_suffix_(name):
return cpt.to_text(name) + core.grad_var_suffix()
def _accumulate_gradients_by_sum_op_(var_name, renamed_vars, pending_sum_ops,
op_idx):
def _accumulate_gradients_by_sum_op_(var_name,
                                     renamed_vars,
                                     pending_sum_ops,
                                     op_idx,
                                     op_device=""):
    """
    Accumulate all renamed copies of ``var_name`` with a single ``sum`` op.

    The generated op desc is queued in ``pending_sum_ops`` under ``op_idx``;
    ``op_device`` is forwarded so the sum runs on the same device as the op
    that produced the duplicates.
    """
    pending_sum_ops.setdefault(op_idx, [])
    sum_desc = _create_op_desc_(
        "sum",
        {"X": renamed_vars[var_name]},
        {"Out": [var_name]},
        {"use_mkldnn": False,
         "op_device": op_device})
    pending_sum_ops[op_idx].append(sum_desc)
    # From here on, the accumulated gradient lives under the original name.
    renamed_vars[var_name] = [var_name]
def _accumulate_gradients_by_add_ops_(var_name, renamed_vars, pending_sum_ops,
op_idx):
def _accumulate_gradients_by_add_ops_(var_name,
renamed_vars,
pending_sum_ops,
op_idx,
op_device=""):
"""
Use several inplace add op to accumulate_gradients, the gradients are stored in renamed_vars.
"""
......@@ -407,7 +416,8 @@ def _accumulate_gradients_by_add_ops_(var_name, renamed_vars, pending_sum_ops,
pending_sum_ops[op_idx].append(
_create_op_desc_("grad_add", {"X": [x_name],
"Y": [y_name]}, {"Out": [out_name]},
{"use_mkldnn": False}))
{"use_mkldnn": False,
"op_device": op_device}))
renamed_vars[var_name] = [var_name]
......@@ -425,23 +435,28 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
renamed_vars = collections.defaultdict(list)
renamed_var_start_idx = collections.defaultdict(list)
for idx, op_desc in enumerate(op_descs):
op_device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
)
op_device = ""
if op_desc.has_attr(op_device_attr_name):
op_device = op_desc.attr(op_device_attr_name)
for var_name in op_desc.input_arg_names():
if "@GRAD" not in var_name:
continue
if len(renamed_vars[var_name]) > 1:
if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
_accumulate_gradients_by_sum_op_(var_name, renamed_vars,
pending_sum_ops, idx)
_accumulate_gradients_by_sum_op_(
var_name, renamed_vars, pending_sum_ops, idx, op_device)
else:
_accumulate_gradients_by_add_ops_(var_name, renamed_vars,
pending_sum_ops, idx)
_accumulate_gradients_by_add_ops_(
var_name, renamed_vars, pending_sum_ops, idx, op_device)
for param_idx, param_name in enumerate(op_desc.output_names()):
arg_names = op_desc.output(param_name)
for arg_idx, var_name in enumerate(arg_names):
if "@GRAD" not in var_name:
continue
#if "@RENAME@" in var_name:
# if "@RENAME@" in var_name:
# continue
if var_name == core.empty_var_name(
) or var_name in op_desc.input_arg_names():
......@@ -677,9 +692,6 @@ def _find_not_need_ops(grad_op_descs, forward_ops, input_grad_names_set):
return not_need_op_descs_set
from .proto import framework_pb2
def serialize_op_decs(op_desc):
protostr = op_desc.serialize_to_string()
proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
......@@ -1710,7 +1722,7 @@ def _find_op_path_(block,
# TODO(liym27): Consider special types of ops.
for i, op in reversed(list(enumerate(block.ops))):
if relevant_op_flags[i] == False \
and _some_in_set_(op.desc.output_arg_names(),output_names):
and _some_in_set_(op.desc.output_arg_names(), output_names):
relevant_op_flags[i] = True
op_path = [
......
......@@ -32,7 +32,6 @@ Communicator is used for async distribute training in distribute_transpiler mode
It's a wrapper of a cpp class Communicator and should be used inside fleet API.
"""
from . import core
from paddle.fluid.framework import Program
from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
__all__ = ['Communicator', 'LargeScaleKV']
......@@ -65,13 +64,11 @@ class Communicator(object):
if mode == DistributedMode.SYNC:
envs["pserver_endpoints"] = ','.join(kwargs["pserver_endpoints"])
envs["trainer_id"] = str(kwargs["trainer_id"])
if mode == DistributedMode.GEO:
envs["trainers"] = str(kwargs["trainers"])
envs["sparse_attrs"] = str(kwargs["sparse_attrs"])
envs["trainer_id"] = str(kwargs["trainer_id"])
envs["need_global_step"] = str(kwargs["need_global_step"])
envs["barrier_table_id"] = str(kwargs["barrier_table_id"])
mode_str = None
......@@ -87,11 +84,20 @@ class Communicator(object):
self.mode = mode_str
self.envs = envs
self.communicator_ = None
self.send_ctx_ = None
self.recv_ctx_ = None
def init_with_ctx(self, send_ctx, recv_ctx):
self.communicator_ = core.DistCommunicator(self.mode, send_ctx,
def init_with_ctx(self,
send_ctx,
recv_ctx,
global_scope(), self.envs)
proto_txt,
unit64_hosts,
scope=global_scope()):
self.communicator_ = core.DistCommunicator(self.mode, proto_txt,
unit64_hosts, send_ctx,
recv_ctx, scope, self.envs)
self.send_ctx_ = send_ctx
self.recv_ctx_ = recv_ctx
def start(self):
"""
......@@ -152,6 +158,20 @@ class Communicator(object):
def recv(self):
    # Delegate to the C++ communicator; presumably pulls the latest
    # parameters from the parameter servers — confirm against core binding.
    self.communicator_.recv()
def init_params(self, context):
    # Forward ``context`` to the C++ communicator's parameter
    # initialization; the expected shape of ``context`` is defined by the
    # core.DistCommunicator binding.
    self.communicator_.init_params(context)
def push_sparse_param(self, var_name, table_id=-1, scope=global_scope()):
    """
    Push one sparse parameter to the parameter servers.

    ``table_id`` defaults to -1, which means "resolve it from the send
    context recorded at init_with_ctx time". The communicator must already
    be running.
    """
    if not self.is_running():
        raise ValueError(
            "Communicator should init first. Using fleet.init_worker() before push_sparse_param()"
        )
    assert isinstance(var_name, str)
    assert isinstance(table_id, int)

    resolved_table_id = table_id
    if resolved_table_id == -1:
        # Look the table up via the variable's send context.
        resolved_table_id = self.send_ctx_[var_name].table_id()
    self.communicator_.push_sparse_param(var_name, resolved_table_id, scope)
class LargeScaleKV(object):
def __init__(self):
......@@ -165,3 +185,11 @@ class LargeScaleKV(object):
def size(self, varname):
    # Return the size the underlying KV store reports for ``varname``;
    # presumably an element/row count — confirm against the core binding.
    return self.scale_kv.size(varname)
class HeterClient(object):
    """Thin Python wrapper around the C++ ``core.HeterClient`` RPC client."""

    def __init__(self, endpoint, trainer_id):
        # Construct the underlying C++ client; argument semantics are
        # defined by the core.HeterClient binding.
        self.heter_client_ = core.HeterClient(endpoint, trainer_id)

    def stop(self):
        # Shut down the underlying RPC client.
        self.heter_client_.stop()
......@@ -1365,7 +1365,8 @@ class Variable(object):
if self.type == core.VarDesc.VarType.SELECTED_ROWS or self.type == core.VarDesc.VarType.LOD_TENSOR:
dtype_str = str(self.dtype).split('.')[1]
var_str = "{name} : {type}.shape{shape}.dtype({dtype}).stop_gradient({stop_gradient})".\
format(name=self.name, type=type_str, shape=self.shape, dtype=dtype_str, stop_gradient=self.stop_gradient)
format(name=self.name, type=type_str, shape=self.shape,
dtype=dtype_str, stop_gradient=self.stop_gradient)
else:
var_str = "{name} : {type})".\
format(name=self.name, type=type_str)
......@@ -2013,7 +2014,8 @@ class Operator(object):
'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
'gen_nccl_id', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_calc_stream',
'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue'
'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue',
'heter_listen_and_serv'
}
def __init__(self,
......@@ -2284,7 +2286,8 @@ class Operator(object):
if outputs_str != "{}":
op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\
format(outputs = outputs_str, op_type=self.type, inputs=inputs_str, attrs=attrs_str)
format(outputs=outputs_str, op_type=self.type,
inputs=inputs_str, attrs=attrs_str)
else:
op_str = "{op_type}(inputs={inputs}, {attrs})".\
format(op_type=self.type, inputs=inputs_str, attrs=attrs_str)
......@@ -3967,8 +3970,9 @@ class IrGraph(object):
def _convert_to_pdf(dot_file_path):
pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
+ ' -o ' + pdf_save_path, shell=True)
exited_code = subprocess.call(
'dot -Tpdf ' + dot_file_path + ' -o ' + pdf_save_path,
shell=True)
if exited_code != 0:
print('The dot command is needed for creating pdf files.')
print('The {} is saved as the dot filetype.'.format(
......@@ -4581,7 +4585,7 @@ class Program(object):
The two code snippets above will generate and print same programs.
"""
#NOTE(zhiqiu): we sync the original program first, since its program may diff with
# NOTE(zhiqiu): we sync the original program first, since its program may diff with
# its desc due to modifying desc in c++ space. E.g. save op will add kLookupTablePath in desc.
self._sync_with_cpp()
......@@ -4611,7 +4615,7 @@ class Program(object):
if hasattr(self, 'lr_sheduler'):
p.lr_sheduler = self.lr_sheduler
#NOTE(zhiqiu): we sync the cloned program, to update its program by
# NOTE(zhiqiu): we sync the cloned program, to update its program by
# its desc.
p._sync_with_cpp()
......@@ -4656,7 +4660,7 @@ class Program(object):
Program: A new, pruned program.
"""
#NOTE(zhiqiu): we sync the original program first, since its program may diff with
# NOTE(zhiqiu): we sync the original program first, since its program may diff with
# its desc due to modifying desc in c++ space. E.g. save op will add kLookupTablePath in desc.
self._sync_with_cpp()
......
......@@ -138,6 +138,13 @@ class CompileTimeStrategy(object):
self.strategy = strategy
self.role_maker = role_maker
try:
self.is_heter_ps_mode = role_maker._is_heter_parameter_server_mode
except:
warnings.warn(
"Using paddle.distributed.fleet instead of paddle.fluid.incubate.fleet"
)
self.is_heter_ps_mode = False
self.origin_sparse_pairs = []
self.origin_dense_pairs = []
......@@ -469,7 +476,7 @@ class CompileTimeStrategy(object):
continue
ctx = self.build_ctx(params, self.param_var_mapping, False, False,
False)
False, False)
dense_recv_ctx[ctx.var_name()] = ctx
for pairs in self.origin_sparse_pairs:
......@@ -498,6 +505,157 @@ class CompileTimeStrategy(object):
"recv_type can only be 1/2/3/4, 1 : DENSE 2. SPARSE 3. DISTRIBUTED 4. ALL"
)
def get_the_one_trainer_send_context(self, split_dense_table):
    """
    Build the trainer-side send contexts.

    In GEO mode only the sparse gradients are sent, so a dedicated context
    per merged sparse pair is built here; every other mode reuses the
    generic ``get_the_one_send_context``.
    """
    if not self.is_geo_mode():
        return self.get_the_one_send_context(split_dense_table)

    trainer_id = self.get_role_id()
    distributed_names = get_sparse_tablenames(self.origin_main_program,
                                              True)
    send_ctx = {}
    for table_idx, (param, grad) in enumerate(self.merged_sparse_pairs):
        grad_name = grad.merged_var.name
        param_name = param.merged_var.name
        is_distributed = param_name in distributed_names

        var = self.origin_main_program.global_block().vars[grad_name]
        # Row width of the sparse table (all dims except the height).
        var_numel = reduce(lambda x, y: x * y, var.shape[1:])

        ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
                          [var_numel], [grad_name], trainer_id, True, True,
                          is_distributed, table_idx)
        send_ctx[ctx.var_name()] = ctx

    if not send_ctx:
        raise ValueError(
            "GeoSGD require sparse parameters in your net.")
    return send_ctx
def get_dense_send_context(self,
                           send_ctx,
                           idx,
                           merged_dense_pairs,
                           trainer_id,
                           split_dense_table=False):
    """
    Append dense communication contexts to ``send_ctx``.

    Without ``split_dense_table`` all dense gradients are fused into one
    "Dense@Grad" context; with it, each gradient gets its own context.
    Returns the next free table index.
    """
    if len(merged_dense_pairs) < 1:
        return idx

    if not split_dense_table:
        # Fuse every dense gradient into a single aggregated context.
        origin_varnames = []
        var_numel = 0
        for _, grad in merged_dense_pairs:
            grad_varname = grad.merged_var.name
            origin_varnames.append(grad_varname)
            var = self.origin_main_program.global_block().vars[grad_varname]
            var_numel += reduce(lambda x, y: x * y, var.shape)

        grad_name = "Dense@Grad"
        trainer_id = self.get_role_id()
        aggregate = True
        dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
                                [var_numel], origin_varnames, trainer_id,
                                aggregate, False, False, idx)
        send_ctx[grad_name] = dense_ctx
        idx += 1
    else:
        # Heter-PS mode: one context per dense gradient.
        for _, grad in merged_dense_pairs:
            origin_varname = grad.merged_var.name
            var = self.origin_main_program.global_block().vars[
                origin_varname]
            var_numel = reduce(lambda x, y: x * y, var.shape)
            dense_ctx = CommContext(
                origin_varname, [origin_varname], ["127.0.0.1:6071"],
                [var_numel], [origin_varname], trainer_id, True, False,
                False, idx)
            send_ctx[origin_varname] = dense_ctx
            idx += 1
    return idx
def get_the_one_send_context(self,
                             split_dense_table=False,
                             use_origin_program=False,
                             ep_list=None):
    """
    Build the full send-context map: dense contexts first (table ids start
    at 0), then one sparse context per merged sparse pair.
    """
    if ep_list is None:
        ep_list = ["127.0.0.1:6071"]
    send_ctx = {}
    trainer_id = self.get_role_id()

    if use_origin_program:
        merged_dense_pairs = self.origin_merged_dense_pairs
        merged_sparse_pairs = self.origin_merged_sparse_pairs
    else:
        merged_dense_pairs = self.merged_dense_pairs
        merged_sparse_pairs = self.merged_sparse_pairs

    # Dense contexts occupy the leading table ids; the helper returns the
    # next free index.
    idx = self.get_dense_send_context(send_ctx, 0, merged_dense_pairs,
                                      trainer_id, split_dense_table)

    distributed_names = get_sparse_tablenames(self.origin_main_program,
                                              True)
    for param, grad in merged_sparse_pairs:
        grad_name = grad.merged_var.name
        param_name = param.merged_var.name
        splited_varname = [
            "{}.block{}".format(param_name, block_i)
            for block_i in range(len(ep_list))
        ]
        is_distributed = param_name in distributed_names

        var = self.origin_main_program.global_block().vars[grad_name]
        shape = list(var.shape)
        # Distributed tables report a height of 0 (rows live remotely).
        shape[0] = 0 if is_distributed else shape[0]

        sparse_ctx = CommContext(grad_name, splited_varname, ep_list, shape,
                                 [grad_name], trainer_id, True, True,
                                 is_distributed, idx)
        idx += 1
        send_ctx[sparse_ctx.var_name()] = sparse_ctx
    return send_ctx
def get_the_one_recv_context(self,
                             is_dense=True,
                             split_dense_table=False,
                             use_origin_program=False):
    """
    Build the recv mapping ``{table_id: [param names]}`` for either the
    dense (``is_dense=True``) or the sparse side of the send contexts.
    """
    if is_dense:
        send_ctx = self.get_the_one_send_context(
            split_dense_table=split_dense_table,
            use_origin_program=use_origin_program)
    else:
        # Sparse tables are always derived from the default send context.
        send_ctx = self.get_the_one_send_context()

    recv_id_maps = {}
    for _, ctx in send_ctx.items():
        # Keep only contexts of the requested kind: dense callers skip
        # sparse contexts, sparse callers skip dense ones.
        if bool(ctx.is_sparse()) == bool(is_dense):
            continue
        param_names = [
            self.grad_name_to_param_name[grad_varname]
            for grad_varname in ctx.origin_varnames()
        ]
        recv_id_maps[ctx.table_id()] = param_names
    return recv_id_maps
def get_server_runtime_config(self):
    # Pass-through to the distributed strategy's server runtime config.
    return self.strategy.get_server_runtime_config()
......
......@@ -82,6 +82,8 @@ def delete_optimizer_pass(program, config):
def distributed_ops_pass(program, config):
trainer_id = config.get_role_id()
send_ctx = config.get_the_one_send_context(
split_dense_table=config.is_heter_ps_mode)
def _get_pull_sparse_ops(_program):
pull_sparse_ops = {}
......@@ -102,6 +104,19 @@ def distributed_ops_pass(program, config):
program.global_block().vars[op.input("Ids")[0]] for op in ops
]
w = program.global_block().vars[ops[0].input("W")[0]]
grad_name = config.param_name_to_grad_name[w.name]
table_id = -1
for name, ctx in send_ctx.items():
if grad_name in ctx.origin_varnames():
table_id = ctx.table_id()
if table_id == -1:
raise ValueError(
"can not find suitable sparse table, please check")
padding_idx = ops[0].attr("padding_idx")
is_distributed = ops[0].attr("is_distributed")
op_type = ops[0].type
......@@ -128,16 +143,6 @@ def distributed_ops_pass(program, config):
if out_var.name in ins:
outputs_idxs[out_id] = idx
tables = config.get_var_distributed(w.name, True)
pserver_endpoints = config.get_ps_endpoints()
tablenames, eps, sections, = [], [], []
for table in tables:
tablenames.append(table[0])
eps.append(table[1])
sections.append(table[2])
if min(outputs_idxs) - max(inputs_idxs) >= 1:
distributed_idx = max(inputs_idxs) + 1
......@@ -148,12 +153,9 @@ def distributed_ops_pass(program, config):
'W': w},
outputs={"Outputs": outputs},
attrs={
"table_names": tablenames,
"endpoints": eps,
"is_distributed": is_distributed,
"pserver_num": len(pserver_endpoints),
"padding_idx": padding_idx,
"trainer_id": trainer_id,
"table_id": table_id,
"lookup_table_version": op_type
})
else:
......@@ -168,9 +170,8 @@ def distributed_ops_pass(program, config):
def append_send_ops_pass(program, config):
mode = config.get_distributed_mode()
trainer_id = config.get_role_id()
pserver_endpoints = config.get_ps_endpoints()
def _append_send_op(union_vars, queue):
def _append_send_op(union_vars, queue, is_sparse, table_id):
if queue == STEP_COUNTER:
send_input_vars = []
......@@ -191,9 +192,8 @@ def append_send_ops_pass(program, config):
outputs={"Out": dummy_output},
attrs={
"send_varnames": [queue],
"merge_add": True,
"use_send_handler": False,
"endpoints": pserver_endpoints,
"is_sparse": is_sparse,
"table_id": table_id,
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
......@@ -205,7 +205,6 @@ def append_send_ops_pass(program, config):
inputs={"X": dummys},
outputs={"Out": []},
attrs={
"endpoints": pserver_endpoints,
"trainer_id": trainer_id,
"half_async": True,
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
......@@ -213,10 +212,15 @@ def append_send_ops_pass(program, config):
dummys = []
sends = config.get_trainer_send_context()
sends = config.get_the_one_trainer_send_context(
split_dense_table=config.is_heter_ps_mode)
for merged_name, send in sends.items():
dummys.append(_append_send_op(send.origin_varnames(), merged_name))
is_sparse = 1 if send.is_sparse() else 0
is_sparse = 2 if send.is_distributed() else is_sparse
dummys.append(
_append_send_op(send.origin_varnames(), merged_name, is_sparse,
send.table_id()))
if mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]:
_append_barrier_op(dummys)
......@@ -225,6 +229,10 @@ def append_send_ops_pass(program, config):
def init_from_server_pass(program, config):
# 0' trainer do not need barrier, it will call barrier at the end init_worker
if config.role_maker._is_first_worker():
return program
fetch_barrier_out = program.global_block().create_var(
name=framework.generate_control_dev_var_name())
......@@ -468,55 +476,6 @@ def create_heter_program(program, config, heter_program, heter_ops,
first_op_index = 0
get_type_var_name = comm_info["input_var_reshape_name"][0].split(
".input_reshape@Heter")[0]
get_type_var = heter_block.vars[get_type_var_name]
# create slice op
insert_recv_slice_op(
heter_program, heter_block, first_op_index,
comm_info["block_input_var_name"],
(-1, sum(comm_info["input_var_reshape_dim"])), get_type_var.dtype,
get_type_var.type, comm_info["input_var_reshape_name"], [
(-1, comm_info["input_var_reshape_dim"][i])
for i in range(len(comm_info["input_var_reshape_dim"]))
])
first_op_index += len(comm_info["input_var_reshape_dim"])
heter_program.global_block().create_var(
name=comm_info["block_input_var_name"],
shape=(-1, sum(comm_info["input_var_reshape_dim"])),
dtype=get_type_var.dtype,
type=get_type_var.type)
# create reshape op
for i in range(len(comm_info["input_var_reshape_name"])):
var_name = entrance_vars[i]
insert_reshape_op(
heter_program,
heter_block,
first_op_index,
comm_info["input_var_reshape_name"][i],
var_name, )
first_op_index += 1
first_op_index = len(heter_block.ops)
# create send reshape op
for i in range(len(exit_vars)):
insert_reshape_op(heter_program, heter_block, first_op_index,
exit_vars[i],
comm_info["output_var_reshape_name"][i],
[-1, comm_info["output_var_reshape_dim"][i]])
first_op_index += 1
# create send concat op
insert_send_concat_op(heter_program, heter_block, first_op_index,
comm_info["output_var_reshape_name"],
comm_info["block_output_var_name"],
[-1, sum(comm_info["output_var_reshape_dim"])])
check_op_device(heter_block, current_device)
# add send op
send_grad_var_list = send_grad_var_list + add_heter_send_op(
program, heter_program, heter_block, block_var_detail[index])
......@@ -525,38 +484,31 @@ def create_heter_program(program, config, heter_program, heter_ops,
send_input_vars = []
dummy_output = []
pserver_endpoints = config.get_ps_endpoints()
optimizer_block[-1].append_op(
type="send",
inputs={"X": send_input_vars},
outputs={"Out": dummy_output},
attrs={
"send_varnames": [STEP_COUNTER],
"merge_add": True,
"use_send_handler": False,
"endpoints": pserver_endpoints
})
# optimizer_block[-1].append_op(
# type="send",
# inputs={"X": send_input_vars},
# outputs={"Out": dummy_output},
# attrs={
# "send_varnames": [STEP_COUNTER],
# "merge_add": True,
# "use_send_handler": False,
# "endpoints": pserver_endpoints
# })
# add info in listen&serv
attrs = {
"grad_to_block_id": grad_to_block_id,
"sparse_grad_to_param": None,
"lr_decay_block_id": None,
"dense_optimize_blocks": None,
"sparse_optimize_blocks": None,
"message_to_block_id": grad_to_block_id,
"optimize_blocks": optimizer_block,
# runtime attribute
"endpoint": config.get_heter_worker_endpoint(),
"fanin": config.get_trainers(),
"pserver_id": config.get_role_id(),
"Fanin": config.get_trainers(),
"distributed_mode": config.get_distributed_mode(),
"rpc_get_thread_num": int(os.getenv("CPU_NUM", 32)),
"rpc_send_thread_num": int(os.getenv("CPU_NUM", 32)),
"rpc_prefetch_thread_num": int(os.getenv("CPU_NUM", 32))
"rpc_exec_thread_num": int(os.getenv("CPU_NUM", 32))
}
# append the listen_and_serv op
heter_program.global_block().append_op(
type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs)
type="heter_listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs)
check_heter_compile_time_strategy(program, config, send_grad_var_list)
......@@ -585,14 +537,15 @@ def create_trainer_program(program, config, heter_ops, block_var_detail):
# joint_var.1_2 -> slice -> reshape -> origin_var
# d) remove send op which related var@grad is not in trainer program
# 2. check every op's device
static_var = []
for device in heter_ops.keys():
for heter_block_index in sorted(heter_ops[device]):
replace_ops_by_communicate_op(program, config, heter_block_index,
heter_ops[device][heter_block_index],
block_var_detail)
static_var += replace_ops_by_communicate_op(
program, config, heter_block_index,
heter_ops[device][heter_block_index], block_var_detail)
remove_trainer_send_op(program, config, heter_block_index,
block_var_detail)
deleter_trainer_useless_var(program)
deleter_trainer_useless_var(config, program, static_var)
check_op_device(program.global_block(), DEFAULT_DEVICE)
......@@ -609,94 +562,28 @@ def replace_ops_by_communicate_op(program, config, heter_block_index, ops_list,
delete_same_ops(program.global_block(), ops_list)
mode = config.get_distributed_mode()
heter_worker_endpoint = config.get_heter_worker_endpoint()
heter_worker_endpoint = config.get_heter_worker_endpoints()
entrance_var = block_var_detail[heter_block_index]["entrance"]
exit_var = block_var_detail[heter_block_index]["exit"]
default_device_comm_info = get_communicate_var_info(
program, heter_block_index - 1,
block_var_detail[heter_block_index - 1]["entrance"],
block_var_detail[heter_block_index - 1]["exit"])
comm_info = get_communicate_var_info(program, heter_block_index,
entrance_var, exit_var)
# create reshape op
for i in range(len(entrance_var)):
insert_reshape_op(
program,
program.global_block(), first_op_idx, entrance_var[i],
default_device_comm_info["output_var_reshape_name"][i],
[-1, default_device_comm_info["output_var_reshape_dim"][i]])
first_op_idx += 1
# create concat op
insert_send_concat_op(
program,
program.global_block(), first_op_idx,
default_device_comm_info["output_var_reshape_name"],
default_device_comm_info["block_output_var_name"],
[-1, sum(default_device_comm_info["output_var_reshape_dim"])])
first_op_idx += 1
# create send op
send_input_vars = [
program.global_block().vars[default_device_comm_info[
"block_output_var_name"]]
]
get_type_var_name = comm_info["output_var_reshape_name"][0].split(
".output_reshape@Heter")[0]
get_type_var = program.global_block().vars[get_type_var_name]
program.global_block().create_var(
name=comm_info["block_output_var_name"],
shape=(-1, sum(comm_info["output_var_reshape_dim"])),
dtype=get_type_var.dtype,
type=get_type_var.type)
recv_vars = [
program.global_block().vars[comm_info["block_output_var_name"]]
]
program.global_block()._insert_op(
index=first_op_idx,
type="send_and_recv",
inputs={"X": send_input_vars},
outputs={"Out": recv_vars},
inputs={"X": program.global_block().vars[entrance_var[0]]},
outputs={"Out": program.global_block().vars[exit_var[0]]},
attrs={
"send_var_name": default_device_comm_info["block_output_var_name"],
"recv_var_name": comm_info["block_output_var_name"],
"endpoint": heter_worker_endpoint,
"send_var_name": entrance_var,
"recv_var_name": exit_var,
"message_name": comm_info["block_input_var_name"],
"endpoints": heter_worker_endpoint,
"trainer_id": config.get_role_id(),
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
first_op_idx += 1
# recv
# create slice op
insert_recv_slice_op(
program,
program.global_block(), first_op_idx,
comm_info["block_output_var_name"],
(-1, sum(comm_info["output_var_reshape_dim"])), get_type_var.dtype,
get_type_var.type, comm_info["output_var_reshape_name"], [
(-1, comm_info["output_var_reshape_dim"][i])
for i in range(len(comm_info["output_var_reshape_dim"]))
])
first_op_idx += len(comm_info["output_var_reshape_dim"])
# create reshape op
for i in range(len(comm_info["output_var_reshape_name"])):
var_name = comm_info["output_var_reshape_name"][i].split(
".output_reshape@Heter")[0]
insert_reshape_op(
program,
program.global_block(),
first_op_idx,
comm_info["output_var_reshape_name"][i],
var_name, )
first_op_idx += 1
return entrance_var + exit_var
def remove_trainer_send_op(program, config, heter_block_index,
......@@ -732,8 +619,14 @@ def add_heter_send_op(program, heter_program, block, block_var_detail):
send_op_dict[var] = op
return send_op_dict
# send_Op = { inputs{'X':[]},
# outputs{'Out':dummy_output},
# attrs{'send_varnames'"[]",
# 'is_sparse':int,
# 'table_id':int } }
send_grad_var_list = []
send_op_dict = _get_send_op_dict()
table_dict = {}
for persistable_var in block_var_detail["persistables"]:
# check var_name == var@GRAD
if "@GRAD" not in persistable_var:
......@@ -742,9 +635,36 @@ def add_heter_send_op(program, heter_program, block, block_var_detail):
continue
if persistable_var not in send_op_dict:
continue
block_append_op(program, heter_program, block,
send_op_dict[persistable_var])
send_op = send_op_dict[persistable_var]
is_sparse = send_op.attr('is_sparse')
table_id = send_op.attr('table_id')
send_varnames = send_op.attr('send_varnames')
send_grad_var_list.append(persistable_var)
if table_id not in table_dict:
table_dict[table_id] = {}
table_dict[table_id]['var_list'] = []
table_dict[table_id]['is_sparse'] = is_sparse
table_dict[table_id]['send_varnames'] = send_varnames
table_dict[table_id]['var_list'].append(persistable_var)
for table_id in table_dict:
dummy_output = block.create_var(
name=framework.generate_control_dev_var_name())
send_input_vars = [
block.vars[union_var]
for union_var in table_dict[table_id]['var_list']
]
block.append_op(
type="send",
inputs={"X": send_input_vars},
outputs={"Out": dummy_output},
attrs={
"send_varnames": table_dict[table_id]['send_varnames'],
"is_sparse": is_sparse,
"table_id": table_id,
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
return send_grad_var_list
......@@ -773,10 +693,10 @@ def get_communicate_var_info(program, block_index, entrance_var_list,
for name in entrance_var_list:
var = program.global_block().vars[name]
shape = var.shape
if len(shape) < 2 or shape[0] != -1:
raise ValueError(
"Variable {} not support heter training. its shape is {}".
format(name, shape))
# if len(shape) < 2 or shape[0] != -1:
# raise ValueError(
# "Variable {} not support heter training. its shape is {}".
# format(name, shape))
recv_var_dim = -1 * reduce(lambda x, y: x * y, shape)
input_var_reshape_dim.append(recv_var_dim)
input_var_reshape_name.append("{}.input_reshape@Heter".format(name))
......@@ -786,10 +706,10 @@ def get_communicate_var_info(program, block_index, entrance_var_list,
for var_name in exit_var_list:
var = program.global_block().vars[var_name]
shape = var.shape
if len(shape) < 2 or shape[0] != -1:
raise ValueError(
"Variable {} not support heter training. its shape is {}".
format(var_name, shape))
# if len(shape) < 2 or shape[0] != -1:
# raise ValueError(
# "Variable {} not support heter training. its shape is {}".
# format(var_name, shape))
send_reshape_dim = -1 * reduce(lambda x, y: x * y, shape)
output_var_reshape_dim.append(send_reshape_dim)
output_var_reshape_name.append("{}.output_reshape@Heter".format(
......@@ -1028,7 +948,10 @@ def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype,
index += 1
def deleter_trainer_useless_var(program):
def deleter_trainer_useless_var(config, program, static_var):
if config.role_maker._is_first_worker():
return []
static_var = list(set(static_var))
porgram_useful_var_list = []
for op in program.global_block().ops:
input_var_list, output_var_list = find_op_input_output(
......@@ -1036,7 +959,7 @@ def deleter_trainer_useless_var(program):
op_var_list = list(set(input_var_list).union(set(output_var_list)))
porgram_useful_var_list = list(
set(porgram_useful_var_list).union(set(op_var_list)))
porgram_useful_var_list += static_var
program_useless_var_list = list(
set(get_vars_name_in_block(program.global_block())).difference(
set(porgram_useful_var_list)))
......
......@@ -20,6 +20,9 @@ set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES ${TARGET_LIBRARIES}
# Discover every test_*.py in this directory and register each one as a
# CTest case via py_test.  NOTE(review): file(GLOB) will not pick up newly
# added tests until the next reconfigure.
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

# Excluded for coverage runs.
list(REMOVE_ITEM TEST_OPS test_custom_op)

foreach(test_name ${TEST_OPS})
  py_test(${test_name} SRCS ${test_name}.py)
endforeach()
......@@ -16,7 +16,6 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer)
list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
list(APPEND DIST_TEST_OPS test_gen_nccl_id_op)
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
......@@ -108,19 +107,14 @@ if(NOT WITH_DISTRIBUTE OR WIN32)
LIST(REMOVE_ITEM TEST_OPS test_fleet_ps)
LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_2)
LIST(REMOVE_ITEM TEST_OPS test_fleet_utils)
LIST(REMOVE_ITEM TEST_OPS test_lookup_sparse_table_split_op)
# TODO: Fix these unittests failed on Windows
list(REMOVE_ITEM TEST_OPS test_fake_init_op)
list(REMOVE_ITEM TEST_OPS test_merge_ids_op)
list(REMOVE_ITEM TEST_OPS test_split_ids_op)
LIST(REMOVE_ITEM TEST_OPS test_ref_by_trainer_id_op)
endif()
if(NOT WITH_DISTRIBUTE)
LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new)
LIST(REMOVE_ITEM TEST_OPS test_desc_clone_dist)
LIST(REMOVE_ITEM TEST_OPS test_program_code_dist)
endif()
if(WIN32)
......@@ -137,6 +131,7 @@ LIST(REMOVE_ITEM TEST_OPS test_hdfs1)
LIST(REMOVE_ITEM TEST_OPS test_hdfs2)
LIST(REMOVE_ITEM TEST_OPS test_hdfs3)
LIST(REMOVE_ITEM TEST_OPS test_checkpoint_saver)
if(APPLE OR WIN32)
LIST(REMOVE_ITEM TEST_OPS test_fs_interface)
LIST(REMOVE_ITEM TEST_OPS test_fleet_metric)
......@@ -206,9 +201,7 @@ if(WITH_COVERAGE OR WIN32 OR WITH_NV_JETSON)
list(REMOVE_ITEM TEST_OPS test_pyramid_hash_op)
endif()
if(NOT WITH_DISTRIBUTE OR WITH_COVERAGE OR WIN32 OR WITH_NV_JETSON)
list(REMOVE_ITEM TEST_OPS test_fleet_pyramid_hash)
endif()
list(REMOVE_ITEM TEST_OPS test_fleet_pyramid_hash)
if(WITH_GPU OR NOT WITH_MKLML)
# matmul with multiple heads need MKL support
......
......@@ -28,6 +28,8 @@ import numpy as np
import ctr_dataset_reader
from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
from paddle.distributed.fleet.utils.ps_util import Distributed
import paddle.distributed.fleet as fleet
paddle.enable_static()
......@@ -52,7 +54,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
For test CTR model, using Fleet api
"""
def net(self, args, batch_size=4, lr=0.01):
def net(self, args, is_train=True, batch_size=4, lr=0.01):
"""
network definition
......@@ -86,13 +88,20 @@ class TestDistCTR2x2(FleetDistRunnerBase):
datas = [dnn_data, lr_data, label]
if args.reader == "pyreader":
if is_train:
self.reader = fluid.io.PyReader(
feed_list=datas,
capacity=64,
iterable=False,
use_double_buffer=False)
else:
self.test_reader = fluid.io.PyReader(
feed_list=datas,
capacity=64,
iterable=False,
use_double_buffer=False)
# build dnn model
# build dnn model
dnn_layer_dims = [128, 128, 64, 32, 1]
dnn_embedding = fluid.layers.embedding(
is_distributed=False,
......@@ -156,6 +165,42 @@ class TestDistCTR2x2(FleetDistRunnerBase):
with open(os.path.join(dirname, "__model__.proto"), "w") as wn:
wn.write(str(program))
def do_distributed_testing(self, args, test_main_program,
                           test_startup_program):
    """Run one evaluation pass of the distributed test program.

    Feeds batches from the fake CTR reader through ``test_main_program``,
    printing the mean loss of every batch (and a final timing summary) on
    rank 0 only.  Iteration stops when the reader raises EOFException.

    NOTE(review): ``test_startup_program`` is accepted but not run here —
    presumably startup was executed earlier in training; confirm.
    """
    # Select the execution place from the DEVICE env var (defaults to CPU).
    requested_device = os.getenv("DEVICE", 'cpu')
    if requested_device == 'cpu':
        place = fluid.CPUPlace()
    elif requested_device == 'gpu':
        place = fluid.CUDAPlace(0)
    executor = fluid.Executor(place)

    # Wire the fake reader into the PyReader built by net(is_train=False).
    sample_generator = paddle.batch(fake_ctr_reader(), batch_size=4)
    self.test_reader.decorate_sample_list_generator(sample_generator)

    started_at = time.time()
    step = 0

    self.test_reader.start()
    try:
        while True:
            step += 1
            fetched = executor.run(program=test_main_program,
                                   fetch_list=[self.avg_cost.name])
            mean_loss = np.mean(fetched)
            message = "TEST ---> batch_idx: {} loss: {}\n".format(step,
                                                                  mean_loss)
            fleet.util.print_on_rank(message, 0)
    except fluid.core.EOFException:
        # The reader signals exhaustion via EOFException; reset for reuse.
        self.test_reader.reset()

    elapsed = time.time() - started_at
    message = "Distributed Test Succeed, Using Time {}\n".format(elapsed)
    fleet.util.print_on_rank(message, 0)
def do_pyreader_training(self, fleet):
"""
do training using dataset, using fetch handler to catch variable
......@@ -168,7 +213,6 @@ class TestDistCTR2x2(FleetDistRunnerBase):
elif device_env == 'gpu':
device = fluid.CUDAPlace(0)
exe = fluid.Executor(device)
exe.run(fluid.default_startup_program())
fleet.init_worker()
......@@ -202,7 +246,6 @@ class TestDistCTR2x2(FleetDistRunnerBase):
exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
self.check_model_right(model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker()
def do_dataset_training(self, fleet):
train_file_list = ctr_dataset_reader.prepare_fake_data()
......@@ -253,8 +296,5 @@ class TestDistCTR2x2(FleetDistRunnerBase):
self.check_model_right(model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker()
if __name__ == "__main__":
runtime_main(TestDistCTR2x2)
......@@ -94,7 +94,6 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
if fleet.is_first_worker():
fleet.save_persistables(executor=exe, dirname=model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker()
def do_dataset_training(self, fleet):
dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data(
......@@ -145,8 +144,6 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
fleet.save_persistables(executor=exe, dirname=model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker()
if __name__ == "__main__":
runtime_main(TestDistGpuPsCTR2x2)
......@@ -173,7 +173,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
model_path = tempfile.mkdtemp()
fleet.save_persistables(executor=exe, dirname=model_path)
shutil.rmtree(model_path)
fleet.stop_worker()
def do_dataset_training(self, fleet):
train_file_list = ctr_dataset_reader.prepare_fake_data()
......@@ -211,9 +210,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
pass_time = time.time() - pass_start
print("do_dataset_training done. using time {}".format(pass_time))
fleet.stop_worker()
print("do_dataset_training stop worker.")
if __name__ == "__main__":
runtime_main(TestHeterPsCTR2x2)
......@@ -242,7 +242,6 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase):
pass_time = time.time() - pass_start
except fluid.core.EOFException:
self.reader.reset()
fleet.stop_worker()
def do_dataset_training(self, fleet):
pass
......
......@@ -177,7 +177,6 @@ class TestDistCTR2x2(FleetDistRunnerBase):
fleet.save_inference_model(exe, model_dir,
[feed.name for feed in self.feeds],
self.avg_cost)
fleet.stop_worker()
if __name__ == "__main__":
......
......@@ -14,21 +14,19 @@
from __future__ import print_function
import os
import unittest
import time
import threading
import numpy
import paddle
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
paddle.enable_static()
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
class TestCommunicator(unittest.TestCase):
def net(self):
......@@ -50,10 +48,15 @@ class TestCommunicator(unittest.TestCase):
avg_cost = self.net()
optimizer = fluid.optimizer.SGD(0.01)
strategy = StrategyFactory.create_async_strategy()
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
os.environ["TEST_MODE"] = "1"
fleet.init_worker()
time.sleep(10)
fleet.stop_worker()
......
......@@ -24,10 +24,8 @@ import numpy
import paddle
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
......@@ -71,19 +69,22 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
exe.run(fleet.startup_program)
exe.run(paddle.static.default_startup_program())
fleet.init_worker()
train_reader = paddle.batch(self.fake_reader(), batch_size=24)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
for batch_id, data in enumerate(train_reader()):
exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[])
exe.run(paddle.static.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
fleet.stop_worker()
def run_ut(self):
strategy = StrategyFactory.create_half_async_strategy()
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
......@@ -91,7 +92,7 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
current_id=0,
role=role_maker.Role.WORKER
if training_role == "TRAINER" else role_maker.Role.SERVER,
worker_num=2,
worker_num=1,
server_endpoints=["127.0.0.1:6002"])
if training_role == "TRAINER":
......@@ -112,15 +113,12 @@ import subprocess
import unittest
import numpy
from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End
import paddle
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator
from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
......
......@@ -19,6 +19,8 @@ import time
import os
import paddle
paddle.enable_static()
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
......@@ -56,6 +58,7 @@ class TestCommunicator(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
os.environ["TEST_MODE"] = "1"
fleet.init_worker()
time.sleep(10)
fleet.stop_worker()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import paddle.fluid as fluid
from test_desc_clone import get_model, program_equal
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
    """Create a DistributeTranspiler and transpile ``main_program``.

    Args:
        trainer_id: index of the current trainer.
        main_program: the program to transpile in place.
        pserver_endpoints: comma-separated pserver endpoint string.
        trainers: total number of trainers.

    Returns:
        The transpiler instance, already transpiled, so callers can ask it
        for pserver/startup programs.
    """
    transpiler = fluid.DistributeTranspiler()
    transpiler.transpile(
        trainer_id=trainer_id,
        program=main_program,
        pservers=pserver_endpoints,
        trainers=trainers)
    return transpiler
class TestDistMnist(unittest.TestCase):
    """Verifies that cloning a pserver program and its startup program
    yields programs equal to the originals (via ``program_equal``)."""

    def test_desc_clone(self):
        paddle.enable_static()

        # Build the model graph into the default main program.
        get_model(batch_size=20)

        # A single pserver; it is also the current endpoint.
        endpoint = "127.0.0.1:9123"
        transpiler = get_transpiler(0,
                                    fluid.default_main_program(), endpoint,
                                    1)

        pserver_prog = transpiler.get_pserver_program(endpoint)
        startup_prog = transpiler.get_startup_program(endpoint, pserver_prog)

        cloned_main = pserver_prog.clone()
        cloned_startup = startup_prog.clone()

        self.assertTrue(program_equal(cloned_main, pserver_prog))
        self.assertTrue(program_equal(cloned_startup, startup_prog))
......@@ -18,6 +18,7 @@ import unittest
import paddle
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid.transpiler.details.program_utils as pu
paddle.enable_static()
......@@ -51,14 +52,15 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
avg_cost = paddle.fluid.layers.mean(cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync = False
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
prog = paddle.fluid.default_main_program()
self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")
self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")
sends = 0
sgds = 0
......@@ -67,7 +69,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
sends += 1
if op.type == "sgd":
sgds += 1
self.assertEqual(sends, 1)
self.assertEqual(sends, 0)
self.assertEqual(sgds, 0)
fleet.init_worker()
......@@ -98,8 +100,6 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
prog = paddle.fluid.default_main_program()
self.assertEqual(prog.global_block().ops[0].type, "listen_and_serv")
fleet.init_server()
......
......@@ -43,11 +43,14 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_x = paddle.fluid.layers.data(name="x", shape=[1], dtype='int64')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
emb = paddle.fluid.layers.embedding(
input=input_x, size=[100, 10], is_sparse=True)
fc_1 = paddle.fluid.layers.fc(input=emb, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
......
......@@ -57,23 +57,12 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
prog = paddle.fluid.default_main_program()
self.assertEqual(prog.global_block().ops[-1].type, "send")
sends = 0
sgds = 0
for op in prog.global_block().ops:
if op.type == "send":
sends += 1
if op.type == "sgd":
sgds += 1
self.assertEqual(sends, 1)
self.assertEqual(sgds, 6)
with self.assertRaises(ValueError):
optimizer.minimize(avg_cost)
def test_a_sync_optimizer_pserver(self):
os.environ["TRAINING_ROLE"] = "PSERVER"
......@@ -100,6 +89,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False}
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -36,6 +36,7 @@ import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
from paddle.distributed.fleet.utils.ps_util import Distributed
__all__ = ['FleetDistRunnerBase', 'TestFleetBase', 'runtime_main']
......@@ -154,6 +155,10 @@ class FleetDistRunnerBase(object):
raise NotImplementedError(
"do_pyreader_training should be implemented by child classes.")
def do_distributed_testing(self, fleet):
raise NotImplementedError(
"do_distributed_testing should be implemented by child classes.")
class TestFleetBase(unittest.TestCase):
"""
......@@ -175,6 +180,7 @@ class TestFleetBase(unittest.TestCase):
self._reader = "pyreader"
self._trainers = 2
self._pservers = 2
self._need_test = 0
self._port_set = set()
global DIST_UT_PORT
......@@ -262,15 +268,15 @@ class TestFleetBase(unittest.TestCase):
python_path += " -m coverage run --branch -p"
env.update(envs)
tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --test {9}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path)
self._reader, gloo_path, self._need_test)
ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8} --test {9}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path)
self._reader, gloo_path, self._need_test)
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
......@@ -362,6 +368,7 @@ def runtime_main(test_class):
parser.add_argument(
'--geo_sgd_need_push_nums', type=int, required=False, default=2)
parser.add_argument('--reader', type=str, required=False, default='dataset')
parser.add_argument('--test', type=int, required=False, default=0)
args = parser.parse_args()
model = test_class()
......@@ -377,3 +384,28 @@ def runtime_main(test_class):
model.run_dataset_trainer(args)
else:
model.run_pyreader_trainer(args)
if args.test:
test_origin_program = fluid.Program()
test_startup_program = fluid.Program()
with fluid.program_guard(
main_program=test_origin_program,
startup_program=test_startup_program):
with fluid.unique_name.guard():
avg_cost = model.net(args, is_train=False)
send_ctx = fleet.fleet._runtime_handle._communicator.send_ctx_
varname2tables = {}
for gradname, ctx in send_ctx.items():
if ctx.is_sparse:
param = gradname.strip("@GRAD")
varname2tables[param] = ctx.table_id()
else:
continue
ps_util = Distributed()
test_main_program = ps_util.estimate(test_origin_program,
varname2tables)
print(str(test_main_program))
print(str(test_startup_program))
model.do_distributed_testing(args, test_main_program,
test_startup_program)
fleet.stop_worker()
......@@ -24,6 +24,7 @@ class TestDistMnistSync2x2(TestFleetBase):
def _setup_config(self):
self._mode = "sync"
self._reader = "pyreader"
self._need_test = 1
def check_with_place(self,
model_file,
......@@ -52,6 +53,7 @@ class TestDistMnistSync2x2(TestFleetBase):
"dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
@unittest.skip(reason="Skip unstable ut, open it when geo fixed")
class TestDistMnistAuto2x2(TestFleetBase):
def _setup_config(self):
self._mode = "auto"
......@@ -116,7 +118,7 @@ class TestDistMnistAsync2x2(TestFleetBase):
"dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
@unittest.skip(reason="Skip unstable ut, reader need to be rewrite")
# @unittest.skip(reason="Skip unstable ut, reader need to be rewrite")
class TestDistMnistAsyncDataset2x2(TestFleetBase):
def _setup_config(self):
self._mode = "async"
......
......@@ -16,14 +16,13 @@ from __future__ import print_function
import os
import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
from test_dist_fleet_base import TestFleetBase
from dist_fleet_simnet_bow import train_network
import paddle
paddle.enable_static()
......@@ -73,7 +72,9 @@ class TestGeoSgdTranspiler(unittest.TestCase):
is_sparse = True
is_distribute = False
strategy = StrategyFactory.create_geo_strategy(5)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False}
avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse)
......@@ -81,9 +82,6 @@ class TestGeoSgdTranspiler(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
pserver_startup_program = fleet.startup_program
pserver_mian_program = fleet.main_program
if __name__ == "__main__":
unittest.main()
......@@ -81,7 +81,10 @@ class FleetDistHeterRunnerBase(object):
def build_strategy(self, args):
self.strategy = paddle.distributed.fleet.DistributedStrategy()
self.strategy.a_sync = True
self.strategy.a_sync_configs = {"launch_barrier": True}
self.strategy.a_sync_configs = {
"launch_barrier": True,
"heter_worker_device_guard": 'gpu'
}
return self.strategy
def build_optimizer(self, avg_cost, strategy):
......@@ -366,3 +369,4 @@ def runtime_main(test_class):
model.run_dataset_trainer(args)
else:
model.run_pyreader_trainer(args)
fleet.stop_worker()
......@@ -14,15 +14,16 @@
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
import paddle
import paddle
paddle.enable_static()
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
......@@ -159,7 +160,7 @@ class TestPSPassWithBow(unittest.TestCase):
"127.0.0.1:36007"
]
role = role_maker.UserDefinedRoleMaker(
role = fleet.UserDefinedRoleMaker(
current_id=0,
role=role_maker.Role.SERVER,
worker_num=2,
......@@ -168,7 +169,10 @@ class TestPSPassWithBow(unittest.TestCase):
fleet.init(role)
loss, acc, _ = self.net()
optimizer = fluid.optimizer.SGD(base_lr)
strategy = StrategyFactory.create_sync_strategy()
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
......
......@@ -157,8 +157,8 @@ class TestPSPassWithBow(unittest.TestCase):
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001"
os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
os.environ["TRAINING_ROLE"] = "PSERVER"
role = role_maker.PaddleCloudRoleMaker()
......@@ -171,28 +171,8 @@ class TestPSPassWithBow(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss)
model_dir = tempfile.mkdtemp()
with self.assertRaises(ValueError):
fleet.init_server(os.path.join(model_dir, "temp"), "xxxx")
with self.assertRaises(ValueError):
fleet.init_server(os.path.join(model_dir, "temp"))
fleet.init_server()
from paddle.fluid.communicator import LargeScaleKV
kv = LargeScaleKV()
kv.save("__emb__.block0",
os.path.join(model_dir, "__emb__", "__emb__.block0"))
kv.size("__emb__.block0")
fluid.framework.switch_main_program(fluid.Program())
fleet.init_server(model_dir)
shutil.rmtree(model_dir)
if __name__ == '__main__':
unittest.main()
......@@ -14,15 +14,16 @@
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
import paddle
import paddle
paddle.enable_static()
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
......@@ -159,7 +160,7 @@ class TestPSPassWithBow(unittest.TestCase):
"127.0.0.1:36007"
]
role = role_maker.UserDefinedRoleMaker(
role = fleet.UserDefinedRoleMaker(
current_id=0,
role=role_maker.Role.SERVER,
worker_num=2,
......@@ -168,7 +169,11 @@ class TestPSPassWithBow(unittest.TestCase):
fleet.init(role)
loss, acc, _ = self.net()
optimizer = fluid.optimizer.SGD(base_lr)
strategy = StrategyFactory.create_geo_strategy(20)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100}
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
......
......@@ -14,15 +14,16 @@
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
import paddle
import paddle
paddle.enable_static()
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
......@@ -162,7 +163,10 @@ class TestPSPassWithBow(unittest.TestCase):
fleet.init(role)
loss, acc, _ = self.net()
optimizer = fluid.optimizer.Adam(base_lr)
strategy = StrategyFactory.create_async_strategy()
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
......
......@@ -14,15 +14,16 @@
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
import paddle
import paddle
paddle.enable_static()
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
......@@ -168,14 +169,16 @@ class TestPSPassWithBow(unittest.TestCase):
fleet.init(role)
loss, acc, _ = self.net()
optimizer = fluid.optimizer.Adagrad(
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=base_lr,
decay_steps=500,
decay_rate=0.969,
staircase=True))
strategy = StrategyFactory.create_async_strategy()
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
......
......@@ -14,15 +14,16 @@
from __future__ import print_function
import os
import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
import paddle
paddle.enable_static()
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
......@@ -161,8 +162,10 @@ class TestPSPassWithBow(unittest.TestCase):
fleet.init(role)
loss, acc, _ = self.net()
optimizer = fluid.optimizer.Adagrad(base_lr)
strategy = StrategyFactory.create_async_strategy()
optimizer = fluid.optimizer.Adam(base_lr)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
......
......@@ -24,6 +24,7 @@ import paddle
paddle.enable_static()
@unittest.skip("do not need currently")
class TestLookupTableFuseOp(unittest.TestCase):
def test_fuse(self):
places = [core.CPUPlace()]
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -13,24 +13,29 @@
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
import paddle
paddle.enable_static()
class TestRefByTrainerIdOp(OpTest):
def setUp(self):
self.op_type = "ref_by_trainer_id"
param_baks = [("x%d" % x, np.random.random((10, 10)).astype("float32"))
for x in range(10)]
self.inputs = {
'X': param_baks,
'TrainerId': np.array([8]).astype("int64")
}
self.outputs = {'Out': param_baks[8][1]}
from paddle.distributed.fleet.runtime.the_one_ps import Table
def test_check_output(self):
self.check_output()
class TestTable(unittest.TestCase):
def test_table_tensor(self):
table = Table()
table.id = 1001
table.table_class = "SPARSE_TABLE"
table.shard_num = -1
table.type = None
table.accessor = None
table.common = None
table.tensor = None
if __name__ == "__main__":
pt = """ downpour_table_param {table_id: 1001 table_class: "SPARSE_TABLE" shard_num: -1 type: None
}"""
self.assertEqual(table.to_string(0), pt)
if __name__ == '__main__':
unittest.main()
......@@ -70,6 +70,7 @@ class SparseLoadOp(unittest.TestCase):
return model_path
@unittest.skip(reason="Skip unstable ut, need rewrite with new implement")
class TestSparseLoadOpCase1(SparseLoadOp):
def test_2ps_0_load(self):
# init No.0 server env
......
......@@ -27,6 +27,7 @@ from paddle.distributed.fleet import fleet
from test_dist_sparse_load_ps0 import SparseLoadOp
@unittest.skip(reason="Skip unstable ut, need rewrite with new implement")
class TestSparseLoadOpCase2(SparseLoadOp):
def test_2ps_0_load(self):
# init No.1 server env
......
......@@ -36,7 +36,7 @@ class TestSparseLoadProgramAdagrad(TestSparseLoadProgram):
scope, train_program, startup_program, loss = self.net()
with fluid.scope_guard(scope):
with fluid.program_guard(train_program, startup_program):
optimizer = fluid.optimizer.Adagrad(1e-3)
optimizer = fluid.optimizer.Adam(1e-3)
optimizer = fleet.distributed_optimizer(optimizer,
self.strategy)
optimizer.minimize(loss)
......
......@@ -36,7 +36,7 @@ class TestSparseLoadProgramFtrl(TestSparseLoadProgram):
scope, train_program, startup_program, loss = self.net()
with fluid.scope_guard(scope):
with fluid.program_guard(train_program, startup_program):
optimizer = fluid.optimizer.Ftrl(1e-3)
optimizer = fluid.optimizer.SGD(1e-3)
optimizer = fleet.distributed_optimizer(optimizer,
self.strategy)
optimizer.minimize(loss)
......
......@@ -36,7 +36,7 @@ class TestSparseLoadProgramMomentum(TestSparseLoadProgram):
scope, train_program, startup_program, loss = self.net()
with fluid.scope_guard(scope):
with fluid.program_guard(train_program, startup_program):
optimizer = fluid.optimizer.Momentum(1e-3, 0.9)
optimizer = fluid.optimizer.SGD(1e-3)
optimizer = fleet.distributed_optimizer(optimizer,
self.strategy)
optimizer.minimize(loss)
......
......@@ -36,7 +36,7 @@ class TestSparseLoadProgramRmsprop(TestSparseLoadProgram):
scope, train_program, startup_program, loss = self.net()
with fluid.scope_guard(scope):
with fluid.program_guard(train_program, startup_program):
optimizer = fluid.optimizer.RMSProp(1e-3)
optimizer = fluid.optimizer.SGD(1e-3)
optimizer = fleet.distributed_optimizer(optimizer,
self.strategy)
optimizer.minimize(loss)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import gc
import paddle.fluid as fluid
import paddle
paddle.enable_static()
class TranspilerAsyncLRDecayTest(unittest.TestCase):
    """Verify the trainer/pserver programs produced by DistributeTranspiler
    in async mode when the optimizer uses an exponential-decay learning rate.

    The checks pin the exact op sequences of the generated programs, so any
    transpiler change that reorders ops will (intentionally) fail here.
    """

    def setUp(self):
        # Two trainers / two pservers; endpoints are never actually bound.
        self.trainer_id = 0
        self.trainers = 2
        self.pservers = 2
        # NOTE: we do not actually bind this port
        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
        self.pserver1_ep = "127.0.0.1:6174"
        self.pserver2_ep = "127.0.0.1:6175"
        self.sync_mode = False
        self.transpiler = None

    def net_conf(self):
        # Small fc regression net; exponential_decay forces the transpiler
        # to emit LR-decay ops into the pserver program (checked below).
        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
        y_predict = fluid.layers.fc(input=x,
                                    size=1000,
                                    act=None,
                                    param_attr=fluid.ParamAttr(name='fc_w'),
                                    bias_attr=fluid.ParamAttr(name='fc_b'))
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        avg_cost = fluid.layers.mean(cost)
        sgd_optimizer = fluid.optimizer.SGD(
            learning_rate=fluid.layers.exponential_decay(
                learning_rate=0.1,
                decay_steps=100,
                decay_rate=0.99,
                staircase=True))
        sgd_optimizer.minimize(avg_cost)

    def get_main_program(self):
        # Build the un-transpiled (origin) program with a fixed seed.
        main = fluid.Program()
        main.random_seed = 1
        with fluid.program_guard(main):
            self.net_conf()
        self.origin_prog = main.clone()
        return main

    def get_trainer(self, config=None):
        # Returns (trainer_main, trainer_startup) after transpilation.
        src = fluid.default_startup_program().clone()
        t = self._transpiler_instance(config)
        trainer_main = t.get_trainer_program(wait_port=False)
        trainer_startup = fluid.default_startup_program()
        # Transpilation must not add blocks to the startup program.
        assert (src.num_blocks == 1)
        assert (trainer_startup.num_blocks == src.num_blocks)
        return trainer_main, trainer_startup

    def get_pserver(self, ep, config=None, sync_mode=True):
        # Returns (pserver_program, pserver_startup) for endpoint `ep`.
        t = self._transpiler_instance(config, sync_mode)
        pserver = t.get_pserver_program(ep)
        startup = t.get_startup_program(ep, pserver)
        return pserver, startup

    def _transpiler_instance(self, config=None, sync_mode=True):
        # Lazily create and cache a single transpiler per test case; later
        # calls reuse the first configuration regardless of their arguments.
        if not self.transpiler:
            main = self.get_main_program()
            self.transpiler = fluid.DistributeTranspiler(config=config)
            self.transpiler.transpile(
                self.trainer_id,
                program=main,
                pservers=self.pserver_eps,
                trainers=self.trainers,
                sync_mode=sync_mode)
        return self.transpiler

    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep, sync_mode=False)
        pserver2, startup2 = self.get_pserver(self.pserver2_ep, sync_mode=False)

        trainer, trainer_startup = self.get_trainer()

        # Expected op sequence of the trainer startup program (params are
        # fetched from the pservers via recv + concat).
        src = [op.type for op in trainer_startup.global_block().ops]
        dst = ['fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', \
               'uniform_random', 'recv', 'recv', 'fetch_barrier', 'concat']
        self.assertEqual(src, dst)

        # Trainer main program: forward, backward, then grad send / param recv.
        self.assertEqual([op.type for op in trainer.global_block().ops], [
            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
            'elementwise_add_grad', 'send', 'mul_grad', 'split_byref', 'send',
            'send', 'recv', 'recv', 'concat'
        ])

        self.assertEqual(len(pserver.blocks), 4)
        # block0: listen_and_serv
        self.assertEqual([op.type for op in pserver.blocks[0].ops],
                         ["listen_and_serv"])
        # block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale
        # (the exponential LR decay computation)
        self.assertEqual([op.type for op in pserver.blocks[1].ops], [
            "sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow",
            "scale"
        ])
        # block1~2: optimize pass
        self.assertEqual([op.type for op in pserver.blocks[2].ops], ["sgd"])
        # confirm startup program
        self.assertEqual([op.type for op in startup.global_block().ops], [
            "fill_constant", "fill_constant", "fill_constant", "fill_constant",
            "uniform_random"
        ])

    def test_transpiler(self):
        # Run the checks inside fresh program/name scopes so cases do not
        # leak state into each other.
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                self.transpiler_test_impl()
        # NOTE: run gc.collect to eliminate pybind side objects to
        # prevent random double-deallocate when inherited in python.
        del self.transpiler
        del main
        del startup
        gc.collect()


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import gc
import paddle
paddle.enable_static()
# Surface collectable-but-uncollected objects while these tests run.
gc.set_debug(gc.DEBUG_COLLECTABLE)


class TranspilerTest(unittest.TestCase):
    """Shared fixture for DistributeTranspiler tests.

    Builds a small fc regression net and exposes helpers that return the
    transpiled trainer and pserver programs. Subclasses override
    transpiler_test_impl() with their assertions.
    """

    def setUp(self):
        # Two trainers / two pservers; endpoints are never actually bound.
        self.trainer_id = 0
        self.trainers = 2
        self.pservers = 2
        # NOTE: we do not actually bind this port
        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
        self.pserver1_ep = "127.0.0.1:6174"
        self.pserver2_ep = "127.0.0.1:6175"
        self.sync_mode = True
        self.transpiler = None

    def net_conf(self):
        # Simple fc regression net optimized with plain SGD.
        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
        y_predict = fluid.layers.fc(input=x,
                                    size=1000,
                                    act=None,
                                    param_attr=fluid.ParamAttr(name='fc_w'),
                                    bias_attr=fluid.ParamAttr(name='fc_b'))
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        avg_cost = fluid.layers.mean(cost)
        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
        sgd_optimizer.minimize(avg_cost)

    def get_main_program(self):
        # Build the un-transpiled (origin) program with a fixed seed.
        main = fluid.Program()
        main.random_seed = 1
        with fluid.program_guard(main):
            self.net_conf()
        self.origin_prog = main.clone()
        return main

    def get_trainer(self, config=None, sync_mode=True):
        # NOTE(review): the sync_mode parameter is ignored — the instance is
        # always requested with sync_mode=True. Harmless in practice because
        # get_pserver() typically builds (and caches) the transpiler first,
        # but worth confirming.
        src = fluid.default_startup_program().clone()
        t = self._transpiler_instance(config, sync_mode=True)
        trainer_main = t.get_trainer_program(wait_port=False)
        trainer_startup = fluid.default_startup_program()
        # Transpilation must not add blocks to the startup program.
        assert (src.num_blocks == 1)
        assert (trainer_startup.num_blocks == src.num_blocks)
        return trainer_main, trainer_startup

    def get_pserver(self, ep, config=None, sync_mode=True):
        # Returns (pserver_program, pserver_startup) for endpoint `ep`.
        t = self._transpiler_instance(config, sync_mode)
        pserver = t.get_pserver_program(ep)
        startup = t.get_startup_program(ep, pserver)
        return pserver, startup

    def _transpiler_instance(self, config=None, sync_mode=True):
        # Lazily create and cache a single transpiler per test case; later
        # calls reuse the first configuration regardless of their arguments.
        if not self.transpiler:
            main = self.get_main_program()
            self.transpiler = fluid.DistributeTranspiler(config=config)
            self.transpiler.transpile(
                self.trainer_id,
                program=main,
                pservers=self.pserver_eps,
                trainers=self.trainers,
                sync_mode=sync_mode)
        return self.transpiler

    def transpiler_test_impl(self):
        # Overridden by subclasses with the actual checks.
        pass

    def test_transpiler(self):
        # Run subclass checks inside fresh program/name scopes.
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                self.transpiler_test_impl()
        # NOTE: run gc.collect to eliminate pybind side objects to
        # prevent random double-deallocate when inherited in python.
        del self.transpiler
        del main
        del startup
        gc.collect()
class TestBasicModelAsync(TranspilerTest):
    """Fully-async mode (runtime_split_send_recv=True): the trainer program
    has no split_byref/concat ops — splitting happens at runtime — and the
    pserver's listen_and_serv op carries distributed_mode == 1."""

    def transpiler_test_impl(self):
        config = fluid.DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        pserver, startup = self.get_pserver(self.pserver1_ep, config, False)
        pserver2, startup2 = self.get_pserver(self.pserver2_ep, config, False)
        trainer, _ = self.get_trainer(config, False)
        # Forward, backward, then bare send/recv (no barriers, no split/concat).
        self.assertEqual([op.type for op in trainer.global_block().ops], [
            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'recv', 'recv'
        ])
        self.assertEqual(len(pserver.blocks), 3)
        # block0: listen_and_serv
        self.assertEqual([op.type for op in pserver.blocks[0].ops],
                         ["listen_and_serv"])
        # distributed_mode 1 corresponds to this fully-async configuration
        self.assertEqual(pserver.blocks[0].ops[0].attr("distributed_mode"), 1)
        # block1~2: optimize pass
        self.assertEqual([op.type for op in pserver.blocks[2].ops], ["sgd"])
class TestBasicModelHalfAsync(TranspilerTest):
    """Half-async mode (sync_mode=False, runtime_split_send_recv=False):
    split_byref/concat stay in the trainer program but there are no
    send/fetch barriers; distributed_mode == 2 on the pserver."""

    def transpiler_test_impl(self):
        config = fluid.DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = False
        pserver, startup = self.get_pserver(self.pserver1_ep, config, False)
        pserver2, startup2 = self.get_pserver(self.pserver2_ep, config, False)
        trainer, _ = self.get_trainer(config, False)
        # Forward, backward, split/send and recv/concat — but no barriers.
        self.assertEqual([op.type for op in trainer.global_block().ops], [
            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
            'elementwise_add_grad', 'send', 'mul_grad', 'split_byref', 'send',
            'recv', 'recv', 'concat'
        ])
        self.assertEqual(len(pserver.blocks), 3)
        # block0: listen_and_serv
        self.assertEqual([op.type for op in pserver.blocks[0].ops],
                         ["listen_and_serv"])
        # distributed_mode 2 corresponds to this half-async configuration
        self.assertEqual(pserver.blocks[0].ops[0].attr("distributed_mode"), 2)
        # block1~2: optimize pass
        self.assertEqual([op.type for op in pserver.blocks[2].ops], ["sgd"])
class TestBasicModelSync(TranspilerTest):
    """Sync mode: the trainer program gains send_barrier/fetch_barrier ops,
    distributed_mode == 0 on the pserver, and the optimize block aggregates
    gradients (sum + scale) before the sgd update."""

    def transpiler_test_impl(self):
        config = fluid.DistributeTranspilerConfig()
        config.sync_mode = True
        config.runtime_split_send_recv = False
        pserver, startup = self.get_pserver(self.pserver1_ep, config, True)
        pserver2, startup2 = self.get_pserver(self.pserver2_ep, config, True)
        trainer, _ = self.get_trainer(config, True)
        # Forward, backward, split/send with barriers, then recv/concat.
        self.assertEqual([op.type for op in trainer.global_block().ops], [
            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
            'elementwise_add_grad', 'send', 'mul_grad', 'split_byref', 'send',
            'send_barrier', 'recv', 'recv', 'fetch_barrier', 'concat'
        ])
        self.assertEqual(len(pserver.blocks), 3)
        # block0: listen_and_serv
        self.assertEqual([op.type for op in pserver.blocks[0].ops],
                         ["listen_and_serv"])
        # distributed_mode 0 corresponds to sync training
        self.assertEqual(pserver.blocks[0].ops[0].attr("distributed_mode"), 0)
        # block1~2: optimize pass — gradients from all trainers are summed
        # and scaled before the sgd update.
        self.assertEqual([op.type for op in pserver.blocks[2].ops],
                         ["sum", "scale", "sgd"])


if __name__ == "__main__":
    unittest.main()
......@@ -19,8 +19,12 @@ import paddle
import paddle.fluid as fluid
import os
import unittest
import numpy as np
import paddle.distributed.fleet.metrics.metric as metric
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
import paddle.distributed.fleet as fleet
from paddle.distributed.fleet.base.util_factory import UtilBase
paddle.enable_static()
class TestFleetMetric(unittest.TestCase):
......@@ -29,6 +33,23 @@ class TestFleetMetric(unittest.TestCase):
def setUp(self):
"""Set up, set envs."""
class FakeUtil(UtilBase):
def __init__(self, fake_fleet):
super(UtilBase, self).__init__()
self.fleet = fake_fleet
def all_reduce(self, input, mode="sum", comm_world="worker"):
input = np.array(input)
input_shape = input.shape
input_list = input.reshape(-1).tolist()
self.fleet._barrier(comm_world)
ans = self.fleet._all_reduce(input_list, mode)
output = np.array(ans).reshape(input_shape)
return output
class FakeFleet:
"""Fake fleet only for test."""
......@@ -42,19 +63,16 @@ class TestFleetMetric(unittest.TestCase):
self.gloo.set_hdfs_store("./tmp_test_metric", "", "")
self.gloo.init()
def _all_reduce(self, input, output, mode="sum"):
def _all_reduce(self, input, mode="sum"):
"""All reduce using gloo."""
input_list = [i for i in input]
ans = self.gloo.all_reduce(input_list, mode)
for i in range(len(ans)):
output[i] = 1
ans = self.gloo.all_reduce(input, mode)
return ans
def _barrier_worker(self):
"""Fake barrier worker, do nothing."""
def _barrier(self, comm_world="worker"):
"""Fake barrier, do nothing."""
pass
self.fleet = FakeFleet()
fleet._role_maker = self.fleet
self.util = FakeUtil(FakeFleet())
def test_metric_1(self):
"""Test cases for metrics."""
......@@ -78,34 +96,34 @@ class TestFleetMetric(unittest.TestCase):
scope = fluid.Scope()
with fluid.scope_guard(scope):
exe.run(startup)
metric.sum(t, scope)
metric.max(t, scope)
metric.min(t, scope)
metric.auc(t, t1, scope)
metric.mae(t1, 3, scope)
metric.rmse(t1, 3, scope)
metric.mse(t1, 3, scope)
metric.acc(t, t1, scope)
metric.sum(str(t.name), scope)
metric.max(str(t.name), scope)
metric.min(str(t.name), scope)
metric.auc(str(t1.name), str(t.name), scope)
metric.mae(str(t1.name), 3, scope)
metric.rmse(str(t1.name), 3, scope)
metric.mse(str(t1.name), 3, scope)
metric.acc(str(t.name), str(t1.name), scope)
metric.sum(t, scope, self.util)
metric.max(t, scope, self.util)
metric.min(t, scope, self.util)
metric.auc(t, t1, scope, self.util)
metric.mae(t1, 3, scope, self.util)
metric.rmse(t1, 3, scope, self.util)
metric.mse(t1, 3, scope, self.util)
metric.acc(t, t1, scope, self.util)
metric.sum(str(t.name), scope, self.util)
metric.max(str(t.name), scope, self.util)
metric.min(str(t.name), scope, self.util)
metric.auc(str(t1.name), str(t.name), scope, self.util)
metric.mae(str(t1.name), 3, scope, self.util)
metric.rmse(str(t1.name), 3, scope, self.util)
metric.mse(str(t1.name), 3, scope, self.util)
metric.acc(str(t.name), str(t1.name), scope, self.util)
arr = np.array([1, 2, 3, 4])
metric.sum(arr)
metric.max(arr)
metric.min(arr)
metric.sum(arr, util=self.util)
metric.max(arr, util=self.util)
metric.min(arr, util=self.util)
arr1 = np.array([[1, 2, 3, 4]])
arr2 = np.array([[1, 2, 3, 4]])
arr3 = np.array([1, 2, 3, 4])
metric.auc(arr1, arr2)
metric.mae(arr, 3)
metric.rmse(arr, 3)
metric.mse(arr, 3)
metric.acc(arr, arr3)
metric.auc(arr1, arr2, util=self.util)
metric.mae(arr, 3, util=self.util)
metric.rmse(arr, 3, util=self.util)
metric.mse(arr, 3, util=self.util)
metric.acc(arr, arr3, util=self.util)
if __name__ == "__main__":
......
......@@ -145,59 +145,8 @@ class TestListenAndServOp(unittest.TestCase):
start_left_time -= sleep_time
def test_rpc_interfaces(self):
# TODO(Yancey1989): need to make sure the rpc interface correctly.
pass
def test_handle_signal_in_serv_op(self):
# run pserver on CPU in sync mode
p1 = self._start_pserver(False, True, run_pserver)
print("test_handle_signal_in_serv_op before _wait_ps_ready")
self._wait_ps_ready(p1.pid)
# raise SIGTERM to pserver
os.kill(p1.pid, signal.SIGINT)
print("test_handle_signal_in_serv_op after kill pid:", p1.pid)
p1.join()
# run pserver on CPU in async mode
p2 = self._start_pserver(False, False, run_pserver)
print("test_handle_signal_in_serv_op after start p2 pid:", p2.pid)
self._wait_ps_ready(p2.pid)
# raise SIGTERM to pserver
os.kill(p2.pid, signal.SIGTERM)
print("test_handle_signal_in_serv_op before join p2 pid:", p2.pid)
p2.join()
gen_complete_file_flag("test_handle_signal_in_serv_op.flag")
def test_list_and_serv_run_empty_optimize_block(self):
# run pserver on CPU in sync mode
p1 = self._start_pserver(False, True, run_pserver_with_empty_block)
print(
"test_list_and_serv_run_empty_optimize_block before _wait_ps_ready")
self._wait_ps_ready(p1.pid)
# raise SIGTERM to pserver
os.kill(p1.pid, signal.SIGINT)
print("test_list_and_serv_run_empty_optimize_block after kill pid:",
p1.pid)
p1.join()
# run pserver on CPU in async mode
p2 = self._start_pserver(False, False, run_pserver_with_empty_block)
print("test_list_and_serv_run_empty_optimize_block after start p2 pid:",
p2.pid)
self._wait_ps_ready(p2.pid)
# raise SIGTERM to pserver
os.kill(p2.pid, signal.SIGTERM)
print("test_list_and_serv_run_empty_optimize_block before join p2 pid:",
p2.pid)
p2.join()
gen_complete_file_flag(
"test_list_and_serv_run_empty_optimize_block.flag")
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestLookupSpraseTable(unittest.TestCase):
    """Smoke test for the lookup_sparse_table_grad_split operator.

    Builds a SelectedRows gradient, runs the op, and prints the split
    results. No numeric assertions are made — this only checks the op
    runs without error.
    """

    def check_with_place(self, place):
        scope = core.Scope()

        rows = [0, 1, 2, 3, 4, 5, 6]
        row_numel = 7

        # Gradient input 'W' as SelectedRows: row i holds the value i in
        # every column.
        w_selected_rows = scope.var('W').get_selected_rows()
        w_selected_rows.set_height(len(rows))
        w_selected_rows.set_rows(rows)
        w_array = np.ones((len(rows), row_numel)).astype("float32")
        for i in range(len(rows)):
            w_array[i] *= i
        w_tensor = w_selected_rows.get_tensor()
        w_tensor.set(w_array, place)

        # create and initialize Id Variable
        ids = scope.var("Ids").get_tensor()

        # create and run lookup_table operator
        lookup_table = Operator(
            "lookup_sparse_table_grad_split",
            Grad='W',
            Row={'Ids'},
            Value={'W'},
            is_entry=False,
            tablename="sparse")
        lookup_table.run(scope, place)

        # get result from Out — printed only, not asserted
        result_array1 = np.array(ids)
        print(result_array1)
        print("== = = == == = == ==== ==== === ")
        value = scope.var("W").get_tensor()
        result_array1 = np.array(value)
        print(result_array1.shape)
        print(result_array1)

    def test_w_is_selected_rows(self):
        places = [core.CPUPlace()]
        # currently only support CPU
        for place in places:
            self.check_with_place(place)


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestMergeIdsOp(OpTest):
    """Checks that merge_ids reassembles row-partitioned embedding values
    back into the lookup order given by each ids tensor."""

    def setUp(self):
        self.op_type = "merge_ids"
        # Two independent lookup-id tensors sharing the same partitions.
        first_ids = np.array([[0], [2], [5], [6]]).astype('int64')
        second_ids = np.array([[0], [2], [2], [3]]).astype('int64')
        # Row ids owned by each of the three value partitions.
        part_rows = [
            np.array([[0], [2]]).astype('int64'),
            np.array([[3], [5]]).astype('int64'),
            np.array([[6]]).astype('int64'),
        ]
        # Embedding values stored by each partition (one row per owned id).
        part_vals = [
            np.array([[0.1, 0.2], [0.2, 0.3]]).astype('float32'),
            np.array([[0.3, 0.4], [0.4, 0.5]]).astype('float32'),
            np.array([[0.5, 0.6]]).astype('float32'),
        ]
        # Rows gathered back in ids order, one output per ids tensor.
        merged_first = np.array(
            [[0.1, 0.2], [0.2, 0.3], [0.4, 0.5], [0.5, 0.6]]).astype('float32')
        merged_second = np.array(
            [[0.1, 0.2], [0.2, 0.3], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
        self.inputs = {
            'Ids': [('ids1', first_ids), ('ids2', second_ids)],
            "Rows": [('rows1', part_rows[0]), ('rows2', part_rows[1]),
                     ('rows3', part_rows[2])],
            "X": [('x0', part_vals[0]), ('x1', part_vals[1]),
                  ('x2', part_vals[2])]
        }
        self.outputs = {
            'Out': [('out1', merged_first), ('out2', merged_second)]
        }

    def test_check_output(self):
        self.check_output()


if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import sys
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layers.io import ListenAndServ
from paddle.fluid.layers.io import Recv
from paddle.fluid.layers.io import Send
import paddle.fluid.layers.ops as ops
class TestProgram2Code(unittest.TestCase):
    """Builds a minimal ListenAndServ server program and a matching
    Send/Recv client program, printing both — i.e. it exercises
    Program.__str__ for the distributed ops."""

    @unittest.skipIf(sys.platform == "win32",
                     "Windows does not support distribution")
    def test_print(self):
        paddle.enable_static()
        place = fluid.CPUPlace()
        self.init_serv(place)
        self.init_client(place, 9123)

    def init_serv(self, place):
        # Server program: receive X, scale it by 10 into scale_0.tmp_0.
        main = fluid.Program()
        with fluid.program_guard(main):
            serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
            with serv.do():
                out_var = main.global_block().create_var(
                    name="scale_0.tmp_0",
                    # BUG FIX: was misspelled "psersistable"; create_var
                    # swallows unknown kwargs, so the var was silently
                    # left non-persistable.
                    persistable=True,
                    dtype="float32",
                    shape=[32, 32])
                x = layers.data(
                    shape=[32, 32],
                    dtype='float32',
                    name="X",
                    append_batch_size=False)
                fluid.initializer.Constant(value=1.0)(x, main.global_block())
                ops._scale(x=x, scale=10.0, out=out_var)

        print(main)

    def init_client(self, place, port):
        # Client program: send X to the server and fetch the scaled result.
        main = fluid.Program()
        with fluid.program_guard(main):
            x = layers.data(
                shape=[32, 32],
                dtype='float32',
                name='X',
                append_batch_size=False)
            fluid.initializer.Constant(value=2.3)(x, main.global_block())

            get_var = main.global_block().create_var(
                name="scale_0.tmp_0",  # server side var
                dtype="float32",
                persistable=False,
                shape=[32, 32])
            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())

            Send("127.0.0.1:%d" % port, [x])
            o = Recv("127.0.0.1:%d" % port, [get_var])

        print(main)


if __name__ == "__main__":
    unittest.main()
......@@ -65,6 +65,7 @@ def run_pserver(pserver_id):
exe.run(program)
@unittest.skip("do not need currently")
class TestListenAndServOp(unittest.TestCase):
def setUp(self):
self.ps_timeout = 5
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import six
from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestSplitIdsOp(OpTest):
    """split_ids should shard the union of the input ids across three
    outputs and deduplicate repeated ids within each shard."""

    def setUp(self):
        self.op_type = "split_ids"
        # Three id tensors with duplicates both within and across tensors.
        id_inputs = [
            ('ids1',
             np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')),
            ('ids2',
             np.array([[6], [2], [3], [3], [5], [2], [6]]).astype('int64')),
            ('ids3',
             np.array([[2], [2], [2], [3], [5], [5], [6]]).astype('int64')),
        ]
        # Expected shard contents (deduplicated).
        shard_a = np.array([[0], [3], [6]]).astype('int64')
        shard_b = np.array([[]]).astype('int64')
        shard_c = np.array([[2], [5]]).astype('int64')
        self.inputs = {'Ids': id_inputs}
        self.outputs = {
            'Out': [('out0', shard_a), ('out1', shard_b), ('out2', shard_c)]
        }

    def test_check_output(self):
        self.check_output()
class TestSplitSelectedRows(unittest.TestCase):
    """Runs the split_ids *operator* on a SelectedRows input and checks each
    of three shards receives the expected ids with values preserved.
    (Despite the class name, the op under test is split_ids — see the
    Operator construction below.)"""

    def get_places(self):
        # CPU only.
        places = [core.CPUPlace()]
        return places

    def test_check_output(self):
        for place in self.get_places():
            self.check_with_place(place)

    def check_with_place(self, place):
        scope = core.Scope()
        rows = [0, 5, 7, 4, 9]
        height = 20
        row_numel = 2

        # initialize input variable X: row i holds [rows[i], rows[i] + 1]
        x = scope.var('X').get_selected_rows()
        x.set_rows(rows)
        x.set_height(height)
        np_array = np.ones((len(rows), row_numel)).astype("float32")
        for i in range(len(rows)):
            for j in range(row_numel):
                np_array[i, j] = rows[i] + j
        x_tensor = x.get_tensor()
        x_tensor.set(np_array, place)

        outs_name = ["out%d" % i for i in six.moves.xrange(3)]
        outs = [
            scope.var(var_name).get_selected_rows() for var_name in outs_name
        ]

        # expected output selected rows: shard k receives ids with
        # id % 3 == k (e.g. 0, 9 -> shard 0; 7, 4 -> shard 1; 5 -> shard 2)
        expected_out_rows = [[0, 9], [7, 4], [5]]

        op = Operator("split_ids", Ids="X", Out=outs_name)

        # Run the op three times to verify results are stable across
        # repeated runs (outputs overwritten, not accumulated).
        for _ in range(3):
            op.run(scope, place)

            for i in range(len(outs)):
                expected_rows = expected_out_rows[i]
                self.assertEqual(outs[i].rows(), expected_rows)
                for j in range(len(expected_rows)):
                    row = expected_rows[j]
                    # Column 0 carries the id itself, column 1 the id + 1,
                    # matching how the input rows were filled above.
                    self.assertAlmostEqual(
                        float(row), np.array(outs[i].get_tensor())[j, 0])
                    self.assertAlmostEqual(
                        float(row + 1), np.array(outs[i].get_tensor())[j, 1])


if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册