diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 902dca209fcc07cc8048acda84e917f0f6c6af68..aa70783ecd68be543b2d5aabee96a5b09bd72e6a 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -180,7 +180,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
         print_train_time(start_time, time.time(), num_samples)
         print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
         # evaluation
-        if not args.no_test and batch_acc:
+        if not args.no_test and batch_acc and not args.use_reader_op:
             pass_test_acc = test(exe, infer_prog, test_reader, feeder,
                                  batch_acc)
             print(", Test Accuracy: %f" % pass_test_acc)
@@ -277,11 +277,12 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
             batch_id += 1
 
         print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc:
+        if not args.no_test and batch_acc and not args.use_reader_op:
+            # we have not implement record io for test
+            # skip test when use args.use_reader_op
             test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                             batch_acc)
             print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
-        exit(0)
 
 
 def print_arguments(args):
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index 2ee2b5be09bfcc2e7fcec7eb2f80e28e4e75ab3d..9ed1093c54a501cc93dbbf9c3651fe70914ce26b 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -199,7 +199,10 @@ def get_model(args):
     batched_train_reader = paddle.batch(
         paddle.reader.shuffle(
             train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus)
-    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus,
+        drop_last=True)
+    batched_test_reader = paddle.batch(
+        train_reader, batch_size=args.batch_size, drop_last=True)
 
-    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
+    return avg_cost, inference_program, optimizer, batched_train_reader,\
+                   batched_test_reader, batch_acc
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 89d7a62fe9aca3a71ad34b976a186a80174bfd5e..6a8b15a6b60a2e5635dc78fc877f0c8da9a2a998 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -118,6 +118,10 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
 
+if(WITH_DISTRIBUTE)
+  add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
+
 if(WITH_GOLANG)
   # we need to symlink Paddle directory into GOPATH. If we
   # don't do it and we have code that depends on Paddle, go
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 4271e4c1bb6bc7b83f2633191ea2d464f4f56c4c..6bc770580640f242cfce6a9838f00210f785010a 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -83,8 +83,13 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto glog lod_rank_table feed_fetch_method)
+if(WITH_DISTRIBUTE)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+else()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+endif()
 
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index 542c4a172891ba9d3621918986089b2e400b6ae8..304b221e7e4c414a0ab562a1b99836d3b7c02efb 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-class SSAGraph;
+struct SSAGraph;
 
 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
  public:
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index d4d6c34108b9f1e457d8eb0c36d10339b03330bd..4a6f53cba1f46214dbff3058b221f878ecf46613 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -44,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+void Executor::Complete() {
+  ::paddle::operators::detail::RPCClient::GetInstance<
+      ::paddle::operators::detail::GRPCClient>()
+      ->SendComplete();
+}
+#endif
+
 void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
   if (var_type == proto::VarType::LOD_TENSOR) {
     var->GetMutable<LoDTensor>();
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index e6f9c3d31c18f762ef2de269977e0642a79fb174..67a0761dac2a9adcdd0ce2b218c4aa505d688d56 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -44,6 +44,13 @@ class Executor {
 
   explicit Executor(const platform::Place& place);
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+  /*
+   * Sending signal to pserver to mark current trainer stop.
+   */
+  void Complete();
+#endif
+
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
    *
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index 3a413941df964c8d9454fafc6030c377c10f9fb1..64d4ceab624312ed366d7e835072899f1f033a88 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -35,14 +35,15 @@ class ReaderBase {
 
 class DecoratedReader : public ReaderBase {
  public:
-  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+  explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
+      : ReaderBase(), reader_(reader) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
   }
 
   void ReInit() override { reader_->ReInit(); }
 
  protected:
-  ReaderBase* reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 
 class FileReader : public ReaderBase {
@@ -64,7 +65,7 @@ class ReaderHolder {
  public:
   void Reset(ReaderBase* reader) { reader_.reset(reader); }
 
-  ReaderBase* Get() const { return reader_.get(); }
+  std::shared_ptr<ReaderBase> Get() const { return reader_; }
 
   void ReadNext(std::vector<LoDTensor>* out) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
@@ -76,7 +77,7 @@ class ReaderHolder {
   }
 
  private:
-  std::unique_ptr<ReaderBase> reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 6b8373b1509c898e6ae70a18833df39a4898714a..02ffe3651e1deefcf6981c3d304d64b9a01661bf 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -34,6 +34,12 @@ void GRPCClient::InitEventLoop() {
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
 
+void GRPCClient::SendComplete() {
+  for (auto& it : channels_) {
+    this->AsyncSendComplete(it.first);
+  }
+}
+
 GRPCClient::~GRPCClient() {
   Wait();
   cq_.Shutdown();
@@ -210,6 +216,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   req_count_++;
 }
 
+void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(COMPLETE_MESSAGE);
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
+
 void GRPCClient::Wait() {
   std::unique_lock<std::mutex> lk(sync_mutex_);
   sync_cond_.wait(lk, [this] { return req_count_ == 0; });
diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h
index 8db73f875e3e2048386e91f6b5efb29b4ee7e193..44000c028b499d9ad1a0e0dd40a5e287cd61d143 100644
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -195,6 +195,8 @@ class GRPCClient : public RPCClient {
 
   void Wait() override;
 
+  void SendComplete() override;
+
  protected:
   void InitImpl() override;
 
@@ -204,6 +206,9 @@ class GRPCClient : public RPCClient {
 
   void Proceed();
 
+  void AsyncSendComplete(const std::string& ep,
+                         int64_t time_out = RPCClient::rpc_time_out);
+
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
 
  private:
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 18651544a1c0207127be335c37fe85c6e24dc16e..2d34f85838c34f1dfe43d2130e127d0258072fa7 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -162,16 +162,18 @@ class RequestPrefetch final : public RequestBase {
 
   void Process() override {
     // prefetch process...
-    std::string varname = request_->OutVarname();
-    VLOG(3) << "RequestPrefetch " << varname;
+    std::string in_var_name = request_->Varname();
+    std::string out_var_name = request_->OutVarname();
+    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+            << " out_var_name: " << out_var_name;
 
     auto scope = request_->GetMutableLocalScope();
-    auto invar = scope->FindVar(varname);
-    framework::Variable* outvar = nullptr;
+    auto invar = scope->FindVar(in_var_name);
+    framework::Variable* outvar = scope->FindVar(out_var_name);
 
-    request_handler_->Handle(varname, scope, invar, &outvar);
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
 
-    SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
                           &reply_);
     Finish(reply_, &responder_);
   }
@@ -287,7 +289,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
   } else if (rpc_name == kRequestPrefetch) {
     b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
   } else {
-    PADDLE_ENFORCE(false, "not surpported rpc");
+    PADDLE_ENFORCE(false, "not supported rpc");
   }
 
   reqs[req_id] = b;
diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h
index fa979024e37f435b918568a1c5e603f8962f9172..a2d08747d59220d30a5b8fd56074fd2739ae3bab 100644
--- a/paddle/fluid/operators/detail/request_handler.h
+++ b/paddle/fluid/operators/detail/request_handler.h
@@ -40,6 +40,7 @@ constexpr char kRequestPrefetch[] = "RequestPrefetch";
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
+#define COMPLETE_MESSAGE "COMPLETE@RECV"
 
 class RPCServer;
 
@@ -60,9 +61,12 @@ class RequestHandler {
   void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
   void SetProgram(framework::ProgramDesc* program) { program_ = program; }
   void SetExecutor(framework::Executor* executor) { executor_ = executor; }
+
+  // Used for dist lookup table prefetch
   void SetPrefetchPreparedCtx(
-      std::unique_ptr<framework::ExecutorPrepareContext> prepared) {
-    prefetch_ctx_.reset(prepared.release());
+      std::unordered_map<
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    prefetch_var_name_to_prepared_ctx_ = g;
   }
 
   // Used for async.
@@ -78,9 +82,6 @@ class RequestHandler {
   bool sync_mode() { return sync_mode_; }
   framework::Scope* scope() { return scope_; }
   const platform::DeviceContext* dev_ctx() { return dev_ctx_; }
-  framework::ExecutorPrepareContext* prefetch_ctx() {
-    return prefetch_ctx_.get();
-  }
   framework::ProgramDesc* program() { return program_; }
   framework::Executor* executor() { return executor_; }
 
@@ -99,8 +100,8 @@ class RequestHandler {
   //           *request_handler_->dev_ctx(), &reply_);
   //    }
   virtual bool Handle(const std::string& varname, framework::Scope* scope,
-                      framework::Variable* var,
-                      framework::Variable** outvar) = 0;
+                      framework::Variable* var, framework::Variable** outvar,
+                      const std::string& out_var_name = "") = 0;
 
  protected:
   const bool sync_mode_;
@@ -109,12 +110,17 @@ class RequestHandler {
   framework::Executor* executor_;
   framework::Scope* scope_;
   framework::ProgramDesc* program_;
-  std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_;
+
+  // used for distribute lookup table prefetch
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      prefetch_var_name_to_prepared_ctx_;
 
   // Used for async.
   std::unordered_map<std::string,
                      std::shared_ptr<framework::ExecutorPrepareContext>>*
       grad_to_prepared_ctx_;
+
   RPCServer* rpc_server_;
 };
 
diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc
index 5f1a346e93b1a0239af77b86d10782d67c403e23..7425bee798cd9ba0af8cd777a6db63862c8a4031 100644
--- a/paddle/fluid/operators/detail/request_handler_impl.cc
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
@@ -30,7 +30,8 @@ namespace detail {
 bool RequestSendHandler::Handle(const std::string& varname,
                                 framework::Scope* scope,
                                 framework::Variable* invar,
-                                framework::Variable** outvar) {
+                                framework::Variable** outvar,
+                                const std::string& out_var_name) {
   VLOG(4) << "RequestSendHandler:" << varname;
 
   // Async
@@ -49,6 +50,9 @@ bool RequestSendHandler::Handle(const std::string& varname,
   if (varname == BATCH_BARRIER_MESSAGE) {
     VLOG(3) << "sync: recv batch barrier message";
     rpc_server_->IncreaseBatchBarrier(kRequestSend);
+  } else if (varname == COMPLETE_MESSAGE) {
+    VLOG(3) << "sync: recv complete message";
+    rpc_server_->DecreaseClientNum();
   } else {
     VLOG(3) << "sync: received var_name: " << varname;
     if (sync_mode_) {
@@ -79,7 +83,8 @@ void RequestSendHandler::ResetSparseVarRecorder() {
 bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Scope* scope,
                                framework::Variable* invar,
-                               framework::Variable** outvar) {
+                               framework::Variable** outvar,
+                               const std::string& out_var_name) {
   VLOG(4) << "RequestGetHandler:" << varname;
 
   if (varname != FETCH_BARRIER_MESSAGE) {
@@ -102,13 +107,14 @@ bool RequestGetHandler::Handle(const std::string& varname,
 bool RequestPrefetchHandler::Handle(const std::string& varname,
                                     framework::Scope* scope,
                                     framework::Variable* invar,
-                                    framework::Variable** outvar) {
+                                    framework::Variable** outvar,
+                                    const std::string& out_var_name) {
   VLOG(4) << "RequestPrefetchHandler " << varname;
 
-  auto var_desc = program_->Block(0).FindVar(varname);
-  *outvar = scope->FindVar(varname);
+  auto var_desc = program_->Block(0).FindVar(out_var_name);
   InitializeVariable(*outvar, var_desc->GetType());
-  executor_->RunPreparedContext(prefetch_ctx_.get(), scope);
+  executor_->RunPreparedContext(
+      (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
 
   return true;
 }
diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h
index c392267cfaeb7e94e6a23d6445c09b756e7e58b1..3f77c09a9598b431d747f1b824615e49d939098e 100644
--- a/paddle/fluid/operators/detail/request_handler_impl.h
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
@@ -39,7 +39,8 @@ class RequestSendHandler final : public RequestHandler {
   explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {}
   virtual ~RequestSendHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
   void ResetSparseVarRecorder();
 
  private:
@@ -52,7 +53,8 @@ class RequestGetHandler final : public RequestHandler {
   explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {}
   virtual ~RequestGetHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
 };
 
 class RequestPrefetchHandler final : public RequestHandler {
@@ -60,7 +62,8 @@ class RequestPrefetchHandler final : public RequestHandler {
   explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
   virtual ~RequestPrefetchHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
 };
 
 }  // namespace detail
diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h
index 7e76ac0348574d4090793b191be0ff3ff8666b37..47c6ffb4fd7a002fc0bd8053fb3314a2fbf18fd3 100644
--- a/paddle/fluid/operators/detail/rpc_client.h
+++ b/paddle/fluid/operators/detail/rpc_client.h
@@ -53,6 +53,11 @@ class RPCClient {
   virtual void AsyncSendFetchBarrier(const std::string& ep,
                                      int64_t time_out = rpc_time_out) = 0;
 
+  // SendComplete tells all the server that current trainer have no more data
+  // to train, so that the pserver can reduce it's barrier count, and continue
+  // to train with other trainers.
+  virtual void SendComplete() = 0;
+
   virtual void Wait() = 0;
 
   static constexpr int64_t rpc_time_out = 120 * 1000;
diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc
index 448763372a8c224cc68319a4a444915896b68234..cd0fe96e2301ee3304fe9a2967df58b9f7072d8d 100644
--- a/paddle/fluid/operators/detail/rpc_server.cc
+++ b/paddle/fluid/operators/detail/rpc_server.cc
@@ -43,7 +43,7 @@ void RPCServer::SavePort() const {
 
 void RPCServer::WaitBarrier(const std::string& rpc_name) {
   std::unique_lock<std::mutex> lock(this->mutex_);
-  barrier_cond_.wait(lock, [=] {
+  barrier_cond_.wait(lock, [this, &rpc_name] {
     return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
   });
 
@@ -53,19 +53,23 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
   VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
   int b = 0;
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    b = ++barrier_counter_[rpc_name];
-  }
-
-  VLOG(3) << "RPCServer IncreaseBatchBarrier " << rpc_name
-          << ", barrier_count:" << b << ", fan_in" << client_num_;
-
+  std::unique_lock<std::mutex> lock(mutex_);
+  b = ++barrier_counter_[rpc_name];
   if (b >= client_num_) {
+    lock.unlock();
     barrier_cond_.notify_all();
+    lock.lock();
   }
 }
 
+void RPCServer::DecreaseClientNum() {
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    client_num_--;
+  }
+  barrier_cond_.notify_all();
+}
+
 void RPCServer::ResetBarrierCounter() {
   VLOG(3) << "RPCServer ResetBarrierCounter ";
   std::unique_lock<std::mutex> lock(mutex_);
diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h
index f809c13c726ac2f1c60e8cf84848c4138f631b44..2e3342428cb56c34abaca655d5906668cda8f140 100644
--- a/paddle/fluid/operators/detail/rpc_server.h
+++ b/paddle/fluid/operators/detail/rpc_server.h
@@ -60,7 +60,7 @@ class RPCServer {
   void SetCond(const std::string& rpc_name);
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);
-
+  void DecreaseClientNum();
   void ResetBarrierCounter();
 
  protected:
@@ -79,8 +79,7 @@ class RPCServer {
   std::string bind_address_;
   std::atomic<int> exit_flag_;
   int selected_port_;
-
-  const int client_num_;
+  int client_num_;
 
   std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
   std::unordered_map<std::string, int> rpc_thread_num_;
diff --git a/paddle/fluid/operators/detail/rpc_server_test.cc b/paddle/fluid/operators/detail/rpc_server_test.cc
index f49274a7b53d04f82718887aabaffd5d33053dfe..463a7b80cfac280de5afe91ee85caaaf074cef32 100644
--- a/paddle/fluid/operators/detail/rpc_server_test.cc
+++ b/paddle/fluid/operators/detail/rpc_server_test.cc
@@ -98,11 +98,17 @@ void StartServer() {
   framework::Executor exe(place);
   platform::CPUDeviceContext ctx(place);
   auto* block = AppendPrefetchBlcok(&program);
-  auto prepared = exe.Prepare(program, block->ID());
+  std::string in_var_name("ids");
+  std::vector<int> prefetch_block_ids{block->ID()};
+  auto prepared = exe.Prepare(program, prefetch_block_ids);
   InitTensorsOnServer(&scope, &place, 10);
 
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared;
+  prefetch_var_name_to_prepared[in_var_name] = prepared[0];
   g_req_handler->SetProgram(&program);
-  g_req_handler->SetPrefetchPreparedCtx(std::move(prepared));
+  g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
   g_req_handler->SetDevCtx(&ctx);
   g_req_handler->SetScope(&scope);
   g_req_handler->SetExecutor(&exe);
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index 0803a6035d342fefdae69297461fc78abbf18414..12364fff96c03c5f9dff23c7c00ceedd043803a6 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -66,40 +66,41 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(-1)
         .EqualGreaterThan(-1);
     AddComment(string::Sprintf(R"DOC(
-Limited Elementwise %s Operator.
+Limited Elementwise %s Operator
 
 The equation is:
 
 $$%s$$
 
-$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be
-smaller than or equal to the dimensions of $X$.
+- $X$: a tensor of any dimension. 
+- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.
 
 There are two cases for this operator:
-1. The shape of $Y$ is same with $X$;
-2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions
-   of size 1 for $Y$ will be ignored for the consideration of subsequence.
 
+1. The shape of $Y$ is the same with $X$.
+2. The shape of $Y$ is a continuous subsequence of $X$.
 
 For case 2:
 
-$Y$ will be broadcasted to match the shape of $X$ and axis should be
-set to index of the start dimension to broadcast $Y$ onto $X$.
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index 
+   for broadcasting $Y$ onto $X$. 
+2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of 
+   subsequence, such as shape(Y) = (2, 1) => (2).
 
-If axis is -1, it is treated as axis=rank(X)-rank(Y).
+For example:
 
-For example
   .. code-block:: python
 
     shape(X) = (2, 3, 4, 5), shape(Y) = (,)
     shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
     shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
     shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
 
-Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
-information. However, the output only shares the LoD information with input $X$.
+The inputs $X$ and $Y$ can carry the different LoD information. 
+But the output only shares the LoD information with the input $X$.
 
 )DOC",
                                GetName(), GetEquation()));
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 76106004404bb0bb108cca003869f07c43bbf62f..4d12278799f66f2fb92b7580ba0c43e845aa4d3a 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -96,19 +96,22 @@ static int64_t GetTimestamp() {
   return tp.tv_sec * 1000 + tp.tv_usec / 1000;
 }
 
-void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
-                                  framework::ProgramDesc *program,
-                                  framework::Scope *recv_scope,
-                                  framework::BlockDesc *prefetch_block) const {
+void ListenAndServOp::RunSyncLoop(
+    framework::Executor *executor, framework::ProgramDesc *program,
+    framework::Scope *recv_scope,
+    const std::vector<int> &prefetch_block_id_list) const {
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
 
-  std::vector<int> block_list;
-  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+  std::vector<int> optimize_block_id_list;
+  for (int blkid = 1; blkid < num_blocks; ++blkid) {
+    if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(),
+                  blkid) == prefetch_block_id_list.end()) {
+      optimize_block_id_list.push_back(blkid);
+    }
   }
-  auto optimize_prepared = executor->Prepare(*program, block_list);
+  auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list);
   // Insert placeholder for block0 which holds current op itself.
   optimize_prepared.insert(
       optimize_prepared.begin(),
@@ -135,16 +138,17 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
     std::vector<size_t> parallel_blkids;
     parallel_blkids.push_back(1);
     double ts = GetTimestamp();
-    for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (blkid != static_cast<size_t>(prefetch_block->ID())) {
-        if (program->Block(blkid).Parent() != last_parent_blkid) {
-          ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-                                program, recv_scope);
-          parallel_blkids.clear();
-          last_parent_blkid = program->Block(blkid).Parent();
-        }
-        parallel_blkids.push_back(blkid);
+    for (size_t i = 1; i < optimize_block_id_list.size(); ++i) {
+      // skip the first optimize block because it is already in the
+      // parallel_blkids.
+      int blkid = optimize_block_id_list[i];
+      if (program->Block(blkid).Parent() != last_parent_blkid) {
+        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
+                              program, recv_scope);
+        parallel_blkids.clear();
+        last_parent_blkid = program->Block(blkid).Parent();
       }
+      parallel_blkids.push_back(blkid);
     }
     ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
                           recv_scope);
@@ -210,18 +214,19 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
   }  // while(true)
 }
 
-static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope,
-                           platform::DeviceContext *dev_ctx,
-                           framework::Executor *executor,
-                           framework::ProgramDesc *program,
-                           framework::ExecutorPrepareContext *prefetch_ctx,
-                           detail::RPCServer *rpc_server) {
+static void FillRequestCtx(
+    detail::RequestHandler *h, framework::Scope *scope,
+    platform::DeviceContext *dev_ctx, framework::Executor *executor,
+    framework::ProgramDesc *program,
+    std::unordered_map<std::string,
+                       std::shared_ptr<framework::ExecutorPrepareContext>>
+        *prefetch_ctx,
+    detail::RPCServer *rpc_server) {
   h->SetScope(scope);
   h->SetDevCtx(dev_ctx);
   h->SetExecutor(executor);
   h->SetProgram(program);
-  h->SetPrefetchPreparedCtx(
-      std::unique_ptr<framework::ExecutorPrepareContext>(prefetch_ctx));
+  h->SetPrefetchPreparedCtx(prefetch_ctx);
   h->SetRPCServer(rpc_server);
 }
 
@@ -255,17 +260,42 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                             request_prefetch_handler_.get());
 
   auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
   auto *program = optimize_block->Program();
   framework::Executor executor(dev_place);
 
   // prepare for prefetch
-  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
-  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  std::vector<int> prefetch_block_id_list;
+  std::unordered_map<int, std::string> block_id_to_prefetch_var_name;
+
+  auto prefetch_var_name_to_block_id_str =
+      Attr<std::vector<std::string>>(kPrefetchVarNameToBlockId);
+  for (const auto &prefetch_var_name_and_id :
+       prefetch_var_name_to_block_id_str) {
+    std::vector<std::string> pieces;
+    split(prefetch_var_name_and_id, ':', &pieces);
+    VLOG(3) << "after split, prefetch_var = " << pieces[0]
+            << ", id=" << pieces[1];
+    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+
+    int block_id = std::stoi(pieces[1]);
+    prefetch_block_id_list.push_back(block_id);
+    block_id_to_prefetch_var_name[block_id] = pieces[0];
+  }
+
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list);
+
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared_ctx;
+  for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) {
+    auto block_id = prefetch_block_id_list[i];
+    auto prefetch_var_name = block_id_to_prefetch_var_name[block_id];
+    prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
+  }
 
   auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope,
-                     &dev_ctx, &executor, program, prefetch_prepared.release(),
-                     rpc_service_.get());
+                     &dev_ctx, &executor, program,
+                     &prefetch_var_name_to_prepared_ctx, rpc_service_.get());
 
   f(request_send_handler_.get());
   f(request_get_handler_.get());
@@ -283,7 +313,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   // Write to a file of server selected port for python use.
   SavePort();
   if (sync_mode) {
-    RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
+    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list);
   } else {
     RunAsyncLoop(&executor, program);
   }
@@ -309,8 +339,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
     AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                     "BlockID to run on server side.");
-    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
-                                    "prefetch block to run on server side.");
+    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
+                                      "prefetch blocks to run on server side.")
+        .SetDefault({});
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
   }
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index 87952cb0e683596b2b0395890b6e25b15f74d7e2..46c3a19e20b3f2dd970a672bb99f98e83d3e25bf 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <atomic>
 #include <set>
 #include <string>
+#include <vector>
 
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -30,7 +31,7 @@ namespace paddle {
 namespace operators {
 
 constexpr char kOptimizeBlock[] = "OptimizeBlock";
-constexpr char kPrefetchBlock[] = "PrefetchBlock";
+constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
 
 void RunServer(std::shared_ptr<detail::RPCServer> service);
 
@@ -46,7 +47,7 @@ class ListenAndServOp : public framework::OperatorBase {
   void RunSyncLoop(framework::Executor* executor,
                    framework::ProgramDesc* program,
                    framework::Scope* recv_scope,
-                   framework::BlockDesc* prefetch_block) const;
+                   const std::vector<int>& prefetch_block_id_list) const;
 
   void RunAsyncLoop(framework::Executor* executor,
                     framework::ProgramDesc* program) const;
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index 4cc7cbc6e89b0712faf9ad9c51480bce00da15f5..ecbae3894d551186f53625a6cc9cfdb36adc8d2d 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -20,7 +20,7 @@ namespace reader {
 
 class BatchReader : public framework::DecoratedReader {
  public:
-  BatchReader(ReaderBase* reader, int batch_size)
+  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size)
       : DecoratedReader(reader), batch_size_(batch_size) {
     buffer_.reserve(batch_size_);
   }
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
index 331224a59899b4a7d517ca4f7141fb5b8f4f5168..0a02fcdeaa5a6de97d59ddce4f58ad945aa2572a 100644
--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -22,7 +22,8 @@ namespace reader {
 
 class CustomReader : public framework::DecoratedReader {
  public:
-  CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
+  CustomReader(const std::shared_ptr<ReaderBase>& reader,
+               const framework::BlockDesc& sub_block,
                const std::vector<std::string>& source_var_names,
                const std::vector<std::string>& sink_var_names)
       : DecoratedReader(reader),
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index bc830a2b72e657f79f4c94e24428d38ff2b7c42e..5f35b9b3eac1d9aab8662833c6e39d12f11a0087 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
 class DoubleBufferReader : public framework::DecoratedReader {
  public:
   explicit DoubleBufferReader(
-      ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
+      const std::shared_ptr<ReaderBase>& reader,
+      platform::Place target_place = platform::CPUPlace())
       : DecoratedReader(reader), place_(target_place) {
     cpu_tensor_cache_.resize(kCacheSize);
     gpu_tensor_cache_.resize(kCacheSize);
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
index 249b0b7c6dbc8b8104bce95562e6e9b2a28c77f8..19b54110b9aeece33b8d6c73612ae0e12dbfafbd 100644
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -21,7 +21,7 @@ namespace reader {
 
 class MultiPassReader : public framework::DecoratedReader {
  public:
-  MultiPassReader(ReaderBase* reader, int pass_num)
+  MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
       : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index fd233be945932eee9f9a3c0c578a43d5b7cc83aa..57e8e21214b7c99e52550fe51a67c9b5201cb46f 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -23,7 +23,8 @@ namespace reader {
 
 class ShuffleReader : public framework::DecoratedReader {
  public:
-  ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0)
+  ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
+                size_t seed = 0)
       : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
     VLOG(10) << "Create shuffle reader of " << reader_;
     if (seed_ == 0) {
diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
index 1db70f3e9699dba604569c36dc35025dfe2c94fe..3798015146f4ffb085aa82e23ca3f1fb3c5cf5a4 100644
--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -21,7 +21,8 @@ namespace reader {
 
 class ThreadedReader : public framework::DecoratedReader {
  public:
-  explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {}
+  explicit ThreadedReader(const std::shared_ptr<ReaderBase>& reader)
+      : DecoratedReader(reader) {}
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
     std::lock_guard<std::mutex> lock(mutex_);
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 4fc9aae8e36e9b43d65fab0b92ec3a2549057128..c202eed354c5f1a91e93e1c3919d1bfebc1bc401 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -21,12 +21,17 @@ limitations under the License. */
 #include <unistd.h>
 #endif
 
+#include <algorithm>
 #include "gflags/gflags.h"
 
 DEFINE_double(fraction_of_cpu_memory_to_use, 1,
               "Default use 100% of CPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");
 
+DEFINE_uint64(
+    initial_cpu_memory_in_mb, 500,
+    "Default initial 500MB of CPU memory for PaddlePaddle, in MD unit.");
+
 DEFINE_double(
     fraction_of_cuda_pinned_memory_to_use, 0.5,
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
@@ -54,7 +59,9 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+  return std::min(static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
+                                      CpuTotalPhysicalMemory()),
+                  FLAGS_initial_cpu_memory_in_mb * 1 << 20);
 }
 
 size_t CpuMinChunkSize() {
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c88fbef63cf26c671246b15ea9872da0e7a92c1a..bd5c613f8cf794df5dfeb7517ed4350f9b3b6099 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -413,6 +413,9 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<const platform::Place &>())
+#ifdef PADDLE_WITH_DISTRIBUTE
+      .def("complete", &Executor::Complete)
+#endif
       .def("run",
            (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
                Executor::Run);
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 6719a4d7eca696a0ec7d742740dc7f3979e988ce..1a010ab3ac619d3377f6f7f2ae3d90dfa6042384 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-All layers just related to the neural network.
+All layers just related to the neural network. 
 """
 
 from ..layer_helper import LayerHelper
@@ -95,7 +95,6 @@ def fc(input,
        num_flatten_dims=1,
        param_attr=None,
        bias_attr=None,
-       use_cudnn=False,
        use_mkldnn=False,
        act=None,
        is_test=False,
@@ -222,6 +221,7 @@ def embedding(input,
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
         is_sparse(bool): The flag indicating whether to use sparse update.
+        is_distributed (bool): Whether to run lookup table from remote parameter server.
         padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
             Otherwise the given :attr:`padding_idx` indicates padding the output
             with zeros whenever lookup encounters it in :attr:`input`. If
@@ -654,8 +654,9 @@ def dynamic_gru(input,
             :attr:`False`.
         gate_activation(str): The activation for update gate and reset gate.
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
-        activation(str): The activation for candidate hidden state.
+        candidate_activation(str): The activation for candidate hidden state.
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
+        h_0 (Variable): The hidden output of the first time step.
 
     Returns:
         Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
@@ -873,6 +874,13 @@ def cos_sim(X, Y):
     """
     This function performs the cosine similarity between two tensors
     X and Y and returns that as the output.
+
+    Args:
+        X (Variable): The input X.
+        Y (Variable): The input Y.
+    
+    Returns:
+        Variable: the output of cosine(X, Y).
     """
     helper = LayerHelper('cos_sim', **locals())
     out = helper.create_tmp_variable(dtype=X.dtype)
@@ -899,15 +907,15 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     unchanged.
 
     Args:
-       x(variable): The input tensor.
-       dropout_prob(float): Probability of setting units to zero.
-       is_test(bool): A flag indicating whether it is in test phrase or not.
-       seed(int): A Python integer used to create random seeds. If this
-                  parameter is set to None, a random seed is used.
-                  NOTE: If an integer seed is given, always the same output
-                  units will be dropped. DO NOT use a fixed seed in training.
-       name(str|None): A name for this layer(optional). If set None, the layer
-                    will be named automatically.
+        x (Variable): The input tensor.
+         dropout_prob (float): Probability of setting units to zero.
+        is_test (bool): A flag indicating whether it is in test phrase or not.
+        seed (int): A Python integer used to create random seeds. If this
+                    parameter is set to None, a random seed is used.
+                    NOTE: If an integer seed is given, always the same output
+                    units will be dropped. DO NOT use a fixed seed in training.
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
 
     Returns:
         Variable: A tensor variable.
@@ -1029,8 +1037,8 @@ def square_error_cost(input, label):
         * :math:`Out`: Output value, same shape with :math:`X`.
 
     Args:
-       input(Variable): Input tensor, has predictions.
-       label(Variable): Label tensor, has target labels.
+        input (Variable): Input tensor, has predictions.
+        label (Variable): Label tensor, has target labels.
 
     Returns:
         Variable: The tensor variable storing the element-wise squared error \
@@ -1059,6 +1067,7 @@ def square_error_cost(input, label):
     return square_out
 
 
+@templatedoc()
 def chunk_eval(input,
                label,
                chunk_scheme,
@@ -1067,6 +1076,18 @@ def chunk_eval(input,
     """
     This function computes and outputs the precision, recall and
     F1-score of chunk detection.
+
+    Args:
+        input (Variable): prediction output of the network.
+        label (Variable): label of the test data set.
+        chunk_scheme (str): ${chunk_scheme_comment}
+        num_chunk_types (int): ${num_chunk_types_comment}
+        excluded_chunk_types (list): ${excluded_chunk_types_comment}
+    
+    Returns:
+        tuple: tuple containing: (precision, recall, f1_score,
+               num_infer_chunks, num_label_chunks,
+               num_correct_chunks)
     """
     helper = LayerHelper("chunk_eval", **locals())
 
@@ -1099,6 +1120,7 @@ def chunk_eval(input,
             num_correct_chunks)
 
 
+@templatedoc()
 def sequence_conv(input,
                   num_filters,
                   filter_size=3,
@@ -1111,6 +1133,19 @@ def sequence_conv(input,
     This function creates the op for sequence_conv, using the inputs and
     other convolutional configurations for the filters and stride as given
     in the input parameters to the function.
+
+    Args:
+        input (Variable): ${x_comment}
+        num_filters (int): number of filters.
+        filter_size (int): the filter size (H and W).
+        filter_stride (int): stride of the filter.
+        padding (bool): if True, add paddings.
+        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): attributes for parameter
+        act (str): the activation type
+    
+    Returns:
+        Variable: output of sequence_conv
     """
 
     # FIXME(dzh) : want to unify the argument of python layer
@@ -1225,33 +1260,34 @@ def conv2d(input,
             W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
 
     Args:
-       input(Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of filter. It is as same as the output
-           image channel.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
-       groups(int): The groups number of the Conv2d Layer. According to grouped
-           convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-           the first half of the filters is only connected to the first half
-           of the input channels, while the second half of the filters is only
-           connected to the second half of the input channels. Default: groups=1
-       param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
-       act(str): Activation type. Default: None
-       name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
+        input (Variable): The input image with [N, C, H, W] format.
+            num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not.
+        act (str): Activation type. Default: None
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
     Returns:
         Variable: The tensor variable storing the convolution and \
@@ -1409,7 +1445,7 @@ def sequence_pool(input, pool_type):
 
 def sequence_first_step(input):
     """
-    This funciton get the first step of sequence.
+    This function gets the first step of sequence.
 
     .. code-block:: text
 
@@ -1442,7 +1478,7 @@ def sequence_first_step(input):
 
 def sequence_last_step(input):
     """
-    This funciton get the last step of sequence.
+    This function gets the last step of sequence.
 
     .. code-block:: text
 
@@ -1486,6 +1522,22 @@ def pool2d(input,
     """
     This function adds the operator for pooling in 2 dimensions, using the
     pooling configurations mentioned in input parameters.
+
+    Args:
+        input (Variable): ${input_comment}
+        pool_size (int): ${ksize_comment}
+        pool_type (str): ${pooling_type_comment}
+        pool_stride (int): stride of the pooling layer.
+        pool_padding (int): padding size.
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+    
+    Returns:
+        Variable: output of pool2d layer.
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(
@@ -1589,7 +1641,6 @@ def batch_norm(input,
 
             hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
             hidden2 = fluid.layers.batch_norm(input=hidden1)
-
     """
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
@@ -1717,6 +1768,7 @@ def layer_norm(input,
         bias_attr(ParamAttr|None): The parameter attribute for the learnable
             bias :math:`b`.
         act(str): Activation to be applied to the output of layer normalizaiton.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: A tensor variable with the same shape as the input.
@@ -1768,6 +1820,17 @@ def layer_norm(input,
 
 
 def beam_search_decode(ids, scores, name=None):
+    """
+    ${beam_search_decode}
+
+    Args:
+        ids (Variable): ${ids_comment}
+        scores (Variable): ${scores_comment}
+        name (str): The name of this layer. It is optional.
+    
+    Returns:
+        tuple: a tuple of two output variable: sentence_ids, sentence_scores
+    """
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
     sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
@@ -1843,46 +1906,46 @@ def conv2d_transpose(input,
            W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
 
     Args:
-       input(Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of the filter. It is as same as the output
-           image channel.
-       output_size(int|tuple|None): The output image size. If output size is a
-           tuple, it must contain two integers, (image_H, image_W). This
-           parameter only works when filter_size is None.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square. None if use output size to
-           calculate filter_size.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
-       groups(int): The groups number of the Conv2d transpose layer. Inspired by
-           grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-           when group=2, the first half of the filters is only connected to the
-           first half of the input channels, while the second half of the
-           filters is only connected to the second half of the input channels.
-           Default: groups=1
-       param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
-                              Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
-       act(str): Activation type. Default: None
-       name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if use output size to
+            calculate filter_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
+                               Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act(str): Activation type. Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
     Returns:
-       Variable: The tensor variable storing the convolution transpose result.
+        Variable: The tensor variable storing the convolution transpose result.
 
     Raises:
-       ValueError: If the shapes of input, filter_size, stride, padding and
-                   groups mismatch.
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
 
     Examples:
        .. code-block:: python
@@ -2019,6 +2082,17 @@ def sequence_expand(x, y, ref_level=-1, name=None):
 def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
     '''
     This function implements the beam search algorithm.
+
+    Args:
+        pre_ids (Variable): ${pre_ids_comment}
+        ids (Variable): ${ids_comment}
+        scores (Variable): ${scores_comment}
+        beam_size (int): ${beam_size_comment}
+        end_id (int): ${end_id_comment}
+        level (int): ${level_comment}
+    
+    Returns:
+        tuple: a tuple of beam_search output variables: selected_ids, selected_scores
     '''
     helper = LayerHelper('beam_search', **locals())
     score_type = scores.dtype
@@ -2521,14 +2595,14 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     slice along dimension `axis`.
 
     Args:
-       x(Variable|list): The input tensor to l2_normalize layer.
-       axis(int): The axis on which to apply normalization. If `axis < 0`,
-           the dimension to normalization is rank(X) + axis. -1 is the
-           last dimension.
-       epsilon(float): The epsilon value is used to avoid division by zero,
-           the defalut value is 1e-10.
-       name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
+        x(Variable|list): The input tensor to l2_normalize layer.
+        axis(int): The axis on which to apply normalization. If `axis < 0`,
+            the dimension to normalization is rank(X) + axis. -1 is the
+            last dimension.
+        epsilon(float): The epsilon value is used to avoid division by zero,
+            the defalut value is 1e-10.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
 
     Returns:
@@ -2741,16 +2815,13 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
     the edit distance will be divided by the length of reference string.
 
     Args:
-
         input(Variable): The indices for hypothesis strings.
-
         label(Variable): The indices for reference strings.
-
         normalized(bool): Indicated whether to normalize the edit distance by
                           the length of reference string.
-
         ignored_tokens(list of int): Tokens that should be removed before
                                      calculating edit distance.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: sequence-to-sequence edit distance in shape [batch_size, 1].
@@ -2840,10 +2911,10 @@ def ctc_greedy_decoder(input, blank, name=None):
                          where Lp is the sum of all input sequences' length and
                          num_classes is the true number of classes. (not
                          including the blank label).
-
         blank(int): the blank label index of Connectionist Temporal
                     Classification (CTC) loss, which is in thehalf-opened
                     interval [0, num_classes + 1).
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: CTC greedy decode result. If all the sequences in result were
@@ -2880,23 +2951,23 @@ def warpctc(input, label, blank=0, norm_by_times=False):
     input tensor.
 
     Args:
-       input(Variable): (LodTensor, default: LoDTensor<float>),
-         the unscaled probabilities of variable-length sequences,
-         which is a 2-D Tensor with LoD information.
-         It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
-         sequences' length and num_classes is the true number of classes.
-         (not including the blank label).
-       label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
-         of variable-length sequence, which is a 2-D Tensor with LoD
-         information. It is of the shape [Lg, 1], where Lg is th sum of
-         all labels' length.
-       blank: (int, default: 0), the blank label index of Connectionist
-         Temporal Classification (CTC) loss, which is in the
-         half-opened interval [0, num_classes + 1).
-       norm_by_times: (bool, default: false), whether to normalize
-       the gradients by the number of time-step, which is also the
-       sequence's length. There is no need to normalize the gradients
-       if warpctc layer was follewed by a mean_op.
+        input(Variable): (LodTensor, default: LoDTensor<float>),
+            the unscaled probabilities of variable-length sequences,
+            which is a 2-D Tensor with LoD information.
+            It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
+            sequences' length and num_classes is the true number of classes.
+            (not including the blank label).
+        label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
+            of variable-length sequence, which is a 2-D Tensor with LoD
+            information. It is of the shape [Lg, 1], where Lg is th sum of
+            all labels' length.
+        blank (int): default 0, the blank label index of Connectionist
+            Temporal Classification (CTC) loss, which is in the
+            half-opened interval [0, num_classes + 1).
+        norm_by_times (bool): default false, whether to normalize
+            the gradients by the number of time-step, which is also the
+            sequence's length. There is no need to normalize the gradients
+            if warpctc layer was follewed by a mean_op.
 
     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
@@ -2955,9 +3026,9 @@ def sequence_reshape(input, new_dim):
     no remainder for each sequence.
 
     Args:
-       input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
-                with shape being [N, M] where M for dimension.
-       new_dim (int): New dimension which the input LoDTensor is reshaped to.
+        input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
+            with shape being [N, M] where M for dimension.
+        new_dim (int): New dimension which the input LoDTensor is reshaped to.
 
     Returns:
         Variable: Reshaped LoDTensor according to new dimension.
@@ -2979,7 +3050,10 @@ def sequence_reshape(input, new_dim):
     return out
 
 
-@autodoc()
+# FIXME(wuyi): let docstring_checker.py understand @autodoc.
+# For now, the comments in c++ use types like Tensor, but in python side
+# the type is often "Variable", and arguments may vary.
+@templatedoc(op_type="nce")
 def nce(input,
         label,
         num_total_classes,
@@ -2987,6 +3061,21 @@ def nce(input,
         param_attr=None,
         bias_attr=None,
         num_neg_samples=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): input variable.
+        label (Variable): label.
+        num_total_classes (int):${num_total_classes_comment}
+        sample_weight (int): ${sample_weight_comment}
+        param_attr (ParamAttr|None): attributes for parameter
+        bias_attr (ParamAttr|None): attributes for bias
+        num_neg_samples (int): ${num_neg_samples_comment}
+    
+    Returns:
+        Variable: output of nce layer.
+    """
     helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)
     dim = input.shape[1]
@@ -3044,8 +3133,9 @@ def transpose(x, perm, name=None):
     perm[i]-th dimension of `input`.
 
     Args:
-       input (Variable): (Tensor), A Tensor.
-       perm (list): A permutation of the dimensions of `input`.
+        x (Variable): The input Tensor.
+        perm (list): A permutation of the dimensions of `input`.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: A transposed Tensor.
@@ -3278,9 +3368,9 @@ def multiplex(inputs, index):
     row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
 
     Args:
-       inputs (list): A list of variables to gather from. All variables have the
+        inputs (list): A list of variables to gather from. All variables have the
                 same shape and the rank is at least 2.
-       index (Variable): Tensor<int32>, index variable which is a 2-D tensor
+        index (Variable): Tensor<int32>, index variable which is a 2-D tensor
                 with shape [M, 1] where M is the batch size.
 
     Returns:
@@ -3479,7 +3569,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         begin(int): The first value of this counter.
         step(int): The increment step between each execution.
 
-    Returns(Variable): The global run counter.
+    Returns:
+        Variable: The global run counter.
     """
     helper = LayerHelper('global_step_counter')
     if counter_name is None:
@@ -3540,7 +3631,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     the corresponding dimension of x.
 
     Args:
-        input(variable): The input tensor.
+        x(variable): The input tensor.
         shape(list): The new shape. At most one dimension of the new shape can
                      be -1.
         actual_shape(variable): An optional input. If provided, reshape
@@ -3552,8 +3643,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
         inplace(bool): If this flag is set true, a new output tensor is created
                        whose data is copied from input x, otherwise the output
                        shares data with input without copying.
+        name (str): The name of this layer. It is optional.
 
-    Returns(variable): The output tensor.
+    Returns:
+        Variable: The output tensor.
 
     Examples:
         .. code-block:: python
@@ -4074,7 +4167,6 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
         name(str|None): The output variable name.
 
     Returns:
-
         ${out_comment}.
     """
 
@@ -4093,6 +4185,7 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
                           This is a 4-D tensor of the shape
                           (num_batches, channels, in_h, in_w).
         out_short_len(int): The length of output images' short edge.
+        resample (str): resample method, default: BILINEAR.
 
     Returns:
         out (Variable): The output is a 4-D tensor of the shape
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 5b299a0f29d2f17f7ecf3509fee341712f3c9d70..2480d4e76a1b5fd76b7dc8299c2f8fcae967145e 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -515,35 +515,38 @@ class DistributeTranspiler:
                                        grad_to_block_id, None)
 
         # process distributed lookup_table
-        prefetch_block = None
+        prefetch_var_name_to_block_id = []
         if self.has_distributed_lookup_table:
             pserver_index = self.pserver_endpoints.index(endpoint)
             table_opt_block = self._create_table_optimize_block(
                 pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
-            prefetch_block = self._create_prefetch_block(
+            prefetch_var_name_to_block_id = self._create_prefetch_block(
                 pserver_index, pserver_program, table_opt_block)
 
         # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
         # not be executed, so it's safe to use optimize_block to hold the place
         if self.has_distributed_lookup_table:
-            assert prefetch_block is not None
+            assert len(prefetch_var_name_to_block_id) > 0
         else:
-            assert prefetch_block is None
-            prefetch_block = pserver_program.global_block()
+            assert len(prefetch_var_name_to_block_id) == 0
+
+        attrs = {
+            "OptimizeBlock": pserver_program.block(1),
+            "endpoint": endpoint,
+            "Fanin": self.trainer_num,
+            "sync_mode": self.sync_mode,
+            "grad_to_block_id": grad_to_block_id
+        }
+        if len(prefetch_var_name_to_block_id) > 0:
+            attrs['prefetch_var_name_to_block_id'] \
+                = prefetch_var_name_to_block_id
 
         # step5 append the listen_and_serv op
         pserver_program.global_block().append_op(
             type="listen_and_serv",
             inputs={'X': recv_inputs},
             outputs={},
-            attrs={
-                "OptimizeBlock": pserver_program.block(1),
-                "endpoint": endpoint,
-                "Fanin": self.trainer_num,
-                "PrefetchBlock": prefetch_block,
-                "sync_mode": self.sync_mode,
-                "grad_to_block_id": grad_to_block_id
-            })
+            attrs=attrs)
 
         pserver_program.sync_with_cpp()
         return pserver_program
@@ -608,8 +611,15 @@ class DistributeTranspiler:
     def _replace_lookup_table_op_with_prefetch(self, program,
                                                pserver_endpoints):
         # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
-        self.prefetch_input_vars = None
-        self.prefetch_output_vars = None
+        # self.all_prefetch_input_vars =
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_input_vars = []
+
+        # self.all_prefetch_input_vars =
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_output_vars = []
 
         continue_search_lookup_table_op = True
         while continue_search_lookup_table_op:
@@ -623,18 +633,19 @@ class DistributeTranspiler:
                     ids_name = op.input("Ids")
                     out_name = op.output("Out")
 
-                    if self.prefetch_input_vars is None:
-                        ids_var = program.global_block().vars[ids_name[0]]
-                        self.prefetch_input_vars = self.create_splited_vars(
-                            source_var=ids_var,
-                            block=program.global_block(),
-                            tag="_prefetch_in_")
-                    if self.prefetch_output_vars is None:
-                        out_var = program.global_block().vars[out_name[0]]
-                        self.prefetch_output_vars = self.create_splited_vars(
-                            source_var=out_var,
-                            block=program.global_block(),
-                            tag="_prefetch_out_")
+                    ids_var = program.global_block().vars[ids_name[0]]
+                    prefetch_input_vars = self.create_splited_vars(
+                        source_var=ids_var,
+                        block=program.global_block(),
+                        tag="_prefetch_in_")
+                    self.all_prefetch_input_vars.append(prefetch_input_vars)
+
+                    out_var = program.global_block().vars[out_name[0]]
+                    prefetch_output_vars = self.create_splited_vars(
+                        source_var=out_var,
+                        block=program.global_block(),
+                        tag="_prefetch_out_")
+                    self.all_prefetch_output_vars.append(prefetch_output_vars)
 
                     # insert split_ids_op
                     program.global_block().insert_op(
@@ -646,14 +657,14 @@ class DistributeTranspiler:
                                 for varname in ids_name
                             ]
                         },
-                        outputs={"Out": self.prefetch_input_vars})
+                        outputs={"Out": prefetch_input_vars})
 
                     # insert prefetch_op
                     program.global_block().insert_op(
                         index=op_index + 1,
                         type="prefetch",
-                        inputs={'X': self.prefetch_input_vars},
-                        outputs={"Out": self.prefetch_output_vars},
+                        inputs={'X': prefetch_input_vars},
+                        outputs={"Out": prefetch_output_vars},
                         attrs={
                             "epmap": pserver_endpoints,
                             RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -663,7 +674,7 @@ class DistributeTranspiler:
                     program.global_block().insert_op(
                         index=op_index + 2,
                         type="concat",
-                        inputs={'X': self.prefetch_output_vars},
+                        inputs={'X': prefetch_output_vars},
                         outputs={
                             "Out": [
                                 program.global_block().vars[varname]
@@ -709,30 +720,34 @@ class DistributeTranspiler:
                                optimize_block):
         # STEP: create prefetch block
         table_var = pserver_program.global_block().vars[self.table_name]
-        prefetch_block = pserver_program.create_block(optimize_block.idx)
-        trainer_ids = self.prefetch_input_vars[pserver_index]
-        pserver_ids = pserver_program.global_block().create_var(
-            name=trainer_ids.name,
-            type=trainer_ids.type,
-            shape=trainer_ids.shape,
-            dtype=trainer_ids.dtype)
-        trainer_out = self.prefetch_output_vars[pserver_index]
-        pserver_out = pserver_program.global_block().create_var(
-            name=trainer_out.name,
-            type=trainer_out.type,
-            shape=trainer_out.shape,
-            dtype=trainer_out.dtype)
-        prefetch_block.append_op(
-            type="lookup_sparse_table",
-            inputs={'Ids': pserver_ids,
-                    "W": table_var},
-            outputs={"Out": pserver_out},
-            attrs={
-                "is_sparse": True,  # has no effect on lookup_table op
-                "is_distributed": True,
-                "padding_idx": -1
-            })
-        return prefetch_block
+        prefetch_var_name_to_block_id = []
+        for index in range(len(self.all_prefetch_input_vars)):
+            prefetch_block = pserver_program.create_block(optimize_block.idx)
+            trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
+            pserver_ids = pserver_program.global_block().create_var(
+                name=trainer_ids.name,
+                type=trainer_ids.type,
+                shape=trainer_ids.shape,
+                dtype=trainer_ids.dtype)
+            trainer_out = self.all_prefetch_output_vars[index][pserver_index]
+            pserver_out = pserver_program.global_block().create_var(
+                name=trainer_out.name,
+                type=trainer_out.type,
+                shape=trainer_out.shape,
+                dtype=trainer_out.dtype)
+            prefetch_block.append_op(
+                type="lookup_sparse_table",
+                inputs={'Ids': pserver_ids,
+                        "W": table_var},
+                outputs={"Out": pserver_out},
+                attrs={
+                    "is_sparse": True,  # has no effect on lookup_table op
+                    "is_distributed": True,
+                    "padding_idx": -1
+                })
+            prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
+                prefetch_block.idx))
+        return prefetch_var_name_to_block_id
 
     def _create_table_optimize_block(self, pserver_index, pserver_program,
                                      pre_block_idx, grad_to_block_id):
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
index 48100e5bf989520043b5ca372b02883faea8a9fd..54a690462699651d3e14f9b24383df01a9740336 100644
--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
@@ -126,9 +126,10 @@ class DocstringChecker(BaseChecker):
         'W9002':
         ('Doc string does not end with "." period', symbol + "-end-with",
          'Used when a doc string does not end with a period'),
-        'W9003': ('All args with their types must be mentioned in doc string',
-                  symbol + "-with-all-args",
-                  'Used when not all arguments are in the doc string '),
+        'W9003':
+        ('All args with their types must be mentioned in doc string %s',
+         symbol + "-with-all-args",
+         'Used when not all arguments are in the doc string '),
         'W9005': ('Missing docstring or docstring is too short',
                   symbol + "-missing", 'Add docstring longer >=10'),
         'W9006': ('Docstring indent error, use 4 space for indent',
@@ -178,6 +179,8 @@ class DocstringChecker(BaseChecker):
         self.indent_style(node)
 
     def missing_doc_string(self, node):
+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
         if node.tolineno - node.fromlineno <= 10:
             return True
 
@@ -199,12 +202,16 @@ class DocstringChecker(BaseChecker):
 
         doc = node.doc
         lines = doc.splitlines()
+        line_num = 0
 
         for l in lines:
+            if line_num == 0:
+                continue
             cur_indent = len(l) - len(l.lstrip())
             if cur_indent % indent != 0:
                 self.add_message('W9006', node=node, line=node.fromlineno)
                 return False
+            line_num += 1
 
         return True
 
@@ -320,15 +327,19 @@ class DocstringChecker(BaseChecker):
             return True
 
         parsed_args = doc.args
+        args_not_documented = set(args) - set(parsed_args)
         if len(args) > 0 and len(parsed_args) <= 0:
-            print "debug:parsed args: ", parsed_args
-            self.add_message('W9003', node=node, line=node.fromlineno)
+            self.add_message(
+                'W9003',
+                node=node,
+                line=node.fromlineno,
+                args=list(args_not_documented))
             return False
 
         for t in args:
             if t not in parsed_args:
-                print t, " with (type) not in ", parsed_args
-                self.add_message('W9003', node=node, line=node.fromlineno)
+                self.add_message(
+                    'W9003', node=node, line=node.fromlineno, args=[t, ])
                 return False
 
         return True
diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook
index e7c92ba671e0eb778b2ab5447bea7c4b14fe761b..150a3f5666bd39d30b7e6518e58a14fb5fe2f14b 100755
--- a/tools/codestyle/pylint_pre_commit.hook
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -7,13 +7,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 export PYTHONPATH=$DIR:$PYTHONPATH
 
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
-for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
+for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
     pylint --disable=all --load-plugins=docstring_checker \
     --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
     TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done
 
-#exit $TOTAL_ERRORS
+exit $TOTAL_ERRORS
 #For now, just warning:
-exit 0
+#exit 0