Merge remote-tracking branch 'ups/develop' into refine/infershape

b0b5f515 · tensor-tang · d61c1176 · 08cfe27c · b0b5f515 · b0b5f515
13 changed files
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -442,8 +442,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
  use_gpu = nccl_ctxs_ != nullptr;
 #endif
-  if (use_gpu ||
+  if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-      strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
    // Insert BCast Ops
    for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
      auto &to_bcast_set = bcast_var_name_set[dev_id];

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -262,7 +262,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
  if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
    buffer.Resize(sizeof(T) * data.size());
  }
-  std::memcpy(buffer.data(), data.data(), buffer.length());
+  std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size());
  // copy LoD
  for (const auto &level : fetch.lod()) {
    output->lod.emplace_back(level);

--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -117,34 +117,6 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  input_slots->assign({input_tensor});
 }
-void BenchAllData(const std::string &model_path, const std::string &data_file,
-                  const int batch_size, const int repeat) {
-  NativeConfig config;
-  config.model_dir = model_path;
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots;
-  DataRecord data(data_file, batch_size);
-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  GetOneBatch(&input_slots, &data, batch_size);
-  for (int i = 0; i < FLAGS_burning; i++) {
-    predictor->Run(input_slots, &outputs_slots);
-  }
-  Timer timer;
-  double sum = 0;
-  for (int i = 0; i < repeat; i++) {
-    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-      GetOneBatch(&input_slots, &data, batch_size);
-      timer.tic();
-      predictor->Run(input_slots, &outputs_slots);
-      sum += timer.toc();
-    }
-  }
-  PrintTime(batch_size, repeat, 1, 0, sum / repeat);
-}
 const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
                                25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
                                44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,

--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -20,6 +20,7 @@ if(WITH_GRPC)
    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
  cc_test(rpc_server_test SRCS rpc_server_test.cc
    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
+  cc_test(varhandle_test SRCS varhandle_test.cc)
  return()
 endif()

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -59,40 +59,32 @@ GRPCClient::~GRPCClient() {
    }
    channels_.clear();
  }
  client_thread_->join();
 }
-bool GRPCClient::AsyncSendVar(const std::string& ep,
+VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
                                      const platform::DeviceContext& ctx,
                                      const framework::Scope& scope,
-                              const std::string& var_name, int64_t time_out) {
+                                      const std::string& var_name,
+                                      int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
+  SendProcessor* s = new SendProcessor(ch);
+  VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope));
+  s->Prepare(h, time_out);
-  framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
+  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
-                      this] {
    auto* var = p_scope->FindVar(var_name_val);
    ::grpc::ByteBuffer req;
    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
-    // varhandle
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = var_name_val;
-    var_h.ctx = p_ctx;
-    var_h.method = "Send";
-    VLOG(3) << var_h.String() << " begin";
    // stub context
-    SendProcessor* s = new SendProcessor(ch);
-    s->Prepare(var_h, time_out);
    s->response_call_back_ = nullptr;
    auto call = s->stub_g_.PrepareUnaryCall(
@@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
  });
  req_count_++;
-  return true;
+  return h;
 }
 void ProcGetResponse(const VarHandle& var_h,
                     const ::grpc::ByteBuffer& ret_msg) {
  framework::Variable* outvar = nullptr;
-  DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
+  DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar);  // Get-response callback: ctx/scope are read through the VarHandle accessors; outvar is not used afterwards
 }
 template <typename T>
@@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
  result->Swap(&tmp);
 }
-bool GRPCClient::AsyncGetVar(const std::string& ep,
+VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                     const platform::DeviceContext& ctx,
                                     const framework::Scope& scope,
-                             const std::string& var_name, int64_t time_out) {
+                                     const std::string& var_name,
+                                     int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
+  GetProcessor* s = new GetProcessor(ch);
+  VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
+  s->Prepare(h, time_out);
-  framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch,
+  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
-                      this] {
    // prepare input
    sendrecv::VariableMessage req;
    req.set_varname(var_name_val);
    ::grpc::ByteBuffer buf;
    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
-    // var handle
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = var_name_val;
-    var_h.ctx = p_ctx;
-    var_h.method = "Get";
-    VLOG(3) << var_h.String() << " begin";
    // stub context
-    GetProcessor* s = new GetProcessor(ch);
-    s->Prepare(var_h, time_out);
    s->response_call_back_ = ProcGetResponse;
    auto call = s->stub_g_.PrepareUnaryCall(
@@ -160,10 +145,10 @@ bool GRPCClient::AsyncGetVar(const std::string& ep,
  req_count_++;
-  return true;
+  return h;
 }
-bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
+VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
                                          const platform::DeviceContext& ctx,
                                          const framework::Scope& scope,
                                          const std::string& in_var_name,
@@ -175,27 +160,21 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
  const std::string out_var_name_val = out_var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
+  GetProcessor* s = new GetProcessor(ch);
+  VarHandlePtr h(
+      new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope));
+  s->Prepare(h, time_out);
  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      time_out, ch, this] {
+                      time_out, s, this] {
    auto* var = p_scope->FindVar(in_var_name_val);
    ::grpc::ByteBuffer req;
    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
-    // var handle
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = out_var_name_val;
-    var_h.ctx = p_ctx;
-    var_h.method = "Prefetch";
-    VLOG(3) << var_h.String() << " begin";
    // stub context
-    GetProcessor* s = new GetProcessor(ch);
-    s->Prepare(var_h, time_out);
    s->response_call_back_ = ProcGetResponse;
    auto call = s->stub_g_.PrepareUnaryCall(
@@ -206,56 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
  });
  req_count_++;
-  return true;
+  return h;
 }
-void GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,  // sends BATCH_BARRIER_MESSAGE to ep and returns a handle for the request
                                               int64_t time_out) {
  const auto ch = GetChannel(ep);
  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  s->Prepare(time_out);
+  VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE,
+                               nullptr, nullptr));  // no device context or scope is attached
+  s->Prepare(h, time_out);  // the processor stores h before the call is issued
  sendrecv::VariableMessage req;
  req.set_varname(BATCH_BARRIER_MESSAGE);
  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
  req_count_++;
+  return h;  // the caller may block on h->Wait()
 }
-void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,  // sends FETCH_BARRIER_MESSAGE to ep and returns a handle for the request
                                               int64_t time_out) {
  const auto ch = GetChannel(ep);
  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
-  s->Prepare(time_out);
+  VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE,
+                               nullptr, nullptr));  // no device context or scope is attached
+  s->Prepare(h, time_out);  // the processor stores h before the call is issued
  sendrecv::VariableMessage req;
  req.set_varname(FETCH_BARRIER_MESSAGE);
  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
  req_count_++;
+  return h;  // the caller may block on h->Wait()
 }
-void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,  // sends COMPLETE_MESSAGE to ep and returns a handle for the request
+                                           int64_t time_out) {
  const auto ch = GetChannel(ep);
  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  s->Prepare(time_out);
+  VarHandlePtr h(
+      new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr));  // no device context or scope is attached
+  s->Prepare(h, time_out);  // the processor stores h before the call is issued
  sendrecv::VariableMessage req;
  req.set_varname(COMPLETE_MESSAGE);
  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
  req_count_++;
+  return h;  // the caller may block on h->Wait()
 }
-void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
+VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
                                               const std::string& dir,
                                               int64_t time_out) {
  const auto ch = GetChannel(ep);
  CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
-  s->Prepare(time_out);
+  VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE,
+                               nullptr, nullptr));
+  s->Prepare(h, time_out);
  sendrecv::VariableMessage req;
  req.set_varname(CHECKPOINT_SAVE_MESSAGE);
@@ -264,6 +255,7 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
  auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
  req_count_++;
+  return h;
 }
 bool GRPCClient::Wait() {
@@ -276,25 +268,28 @@ void GRPCClient::Proceed() {
  void* tag = nullptr;
  bool ok = false;
+  VLOG(3) << "GRPCClient Proceed begin";
  while (!stopped_ && cq_.Next(&tag, &ok)) {
    BaseProcessor* c = static_cast<BaseProcessor*>(tag);
    GPR_ASSERT(ok);
    PADDLE_ENFORCE(c);
    if (c->status_.ok()) {
-      VLOG(3) << c->var_h_.String() << " process";
+      VLOG(3) << c->GetVarHandlePtr()->String() << " process";
      c->Process();
    } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
-      LOG(ERROR) << c->var_h_.String()
+      LOG(ERROR) << c->GetVarHandlePtr()->String()
                 << " meets grpc error:" << c->status_.error_message();
      {
        std::lock_guard<std::mutex> lk(sync_mutex_);
        ok_ = false;
      }
-      sync_cond_.notify_all();
+      c->Finish(false);
    } else {
-      LOG(FATAL) << c->var_h_.String()
+      LOG(FATAL) << c->GetVarHandlePtr()->String()
                 << " meets grpc error:" << c->status_.error_message();
+      c->Finish(false);
    }
    delete c;
    {
      std::lock_guard<std::mutex> lk(sync_mutex_);
@@ -302,6 +297,7 @@ void GRPCClient::Proceed() {
    }
    sync_cond_.notify_all();
  }
+  VLOG(3) << "GRPCClient Proceed end";
 }
 std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {

--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
 class BaseProcessor {
 public:
-  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
+  BaseProcessor() { context_ = nullptr; }
-    context_ = nullptr;
-  }
  virtual ~BaseProcessor() {}
-  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
+  virtual void Prepare(VarHandlePtr h, int64_t time_out) {
+    var_h_ = h;
    context_.reset(new grpc::ClientContext());
-    var_h_ = var_info;
    context_->set_wait_for_ready(true);
    if (time_out) {
      std::chrono::system_clock::time_point deadline =
@@ -71,21 +70,21 @@ class BaseProcessor {
    }
  }
-  virtual void Prepare(int64_t time_out) {
+  void Process() {
-    context_.reset(new grpc::ClientContext());
+    ProcessImpl();
-    context_->set_wait_for_ready(true);
+    var_h_->Finish(true);
-    std::chrono::system_clock::time_point deadline =
-        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
-    context_->set_deadline(deadline);
  }
-  virtual void Process() = 0;
+  VarHandlePtr GetVarHandlePtr() { return var_h_; }
+  bool Wait() { return var_h_->Wait(); }
+  void Finish(bool ok) { return var_h_->Finish(ok); }
+  virtual void ProcessImpl() = 0;
  std::unique_ptr<grpc::ClientContext> context_;
  grpc::Status status_;
-  VarHandle var_h_;
+ protected:
+  VarHandlePtr var_h_;
 };
 typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
@@ -94,13 +93,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
 class SendProcessor : public BaseProcessor {
 public:
  explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch), stub_g_(ch) {}
+      : BaseProcessor(), stub_g_(ch) {}
  virtual ~SendProcessor() {}
-  virtual void Process() {
+  void ProcessImpl() override {
    if (response_call_back_) {
-      response_call_back_(var_h_, reply_);
+      response_call_back_(*var_h_.get(), reply_);
    }
  }
@@ -115,13 +114,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
 class GetProcessor : public BaseProcessor {
 public:
  explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch), stub_g_(ch) {}
+      : BaseProcessor(), stub_g_(ch) {}
  virtual ~GetProcessor() {}
-  virtual void Process() {
+  void ProcessImpl() override {
    if (response_call_back_) {
-      response_call_back_(var_h_, reply_);
+      response_call_back_(*var_h_.get(), reply_);
    }
  }
@@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor {
 class BatchBarrierProcessor : public BaseProcessor {
 public:
  explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {  // the base takes no channel; ch is only used to build stub_ below
    stub_ = sendrecv::SendRecvService::NewStub(ch);
  }
  virtual ~BatchBarrierProcessor() {}
-  virtual void Process() {}
+  void ProcessImpl() override {}  // empty: nothing is done with reply_ for this request type
  sendrecv::VoidMessage reply_;
  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor {
 class FetchBarrierProcessor : public BaseProcessor {
 public:
  explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {  // the base takes no channel; ch is only used to build stub_ below
    stub_ = sendrecv::SendRecvService::NewStub(ch);
  }
  virtual ~FetchBarrierProcessor() {}
-  virtual void Process() {}
+  void ProcessImpl() override {}  // empty: nothing is done with reply_ for this request type
  sendrecv::VariableMessage reply_;
  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor {
 class CheckpointNotifyProcessor : public BaseProcessor {
 public:
  explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {  // the base takes no channel; ch is only used to build stub_ below
    stub_ = sendrecv::SendRecvService::NewStub(ch);
  }
  virtual ~CheckpointNotifyProcessor() {}
-  virtual void Process() {}
+  void ProcessImpl() override {}  // empty: nothing is done with reply_ for this request type
  sendrecv::VoidMessage reply_;
  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@@ -177,32 +176,37 @@ class GRPCClient : public RPCClient {
  GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
  virtual ~GRPCClient();
-  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
+  VarHandlePtr AsyncSendVar(const std::string& ep,
-                    const framework::Scope& scope, const std::string& var_name,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
                            int64_t time_out = FLAGS_rpc_deadline) override;
-  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
+  VarHandlePtr AsyncGetVar(const std::string& ep,
-                   const framework::Scope& scope, const std::string& var_name,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
                           int64_t time_out = FLAGS_rpc_deadline) override;
-  bool AsyncPrefetchVar(const std::string& ep,
+  VarHandlePtr AsyncPrefetchVar(const std::string& ep,
                                const platform::DeviceContext& ctx,
                                const framework::Scope& scope,
                                const std::string& in_var_name,
                                const std::string& out_var_name,
                                int64_t time_out = FLAGS_rpc_deadline) override;
-  void AsyncSendBatchBarrier(const std::string& ep,
+  VarHandlePtr AsyncSendBatchBarrier(
-                             int64_t time_out = FLAGS_rpc_deadline) override;
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
-  void AsyncSendFetchBarrier(const std::string& ep,
+  VarHandlePtr AsyncSendFetchBarrier(
-                             int64_t time_out = FLAGS_rpc_deadline) override;
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
-  void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
+  VarHandlePtr AsyncCheckpointNotify(
+      const std::string& ep, const std::string& dir,
      int64_t time_out = FLAGS_rpc_deadline) override;
-  void AsyncSendComplete(const std::string& ep,
+  VarHandlePtr AsyncSendComplete(
-                         int64_t time_out = FLAGS_rpc_deadline) override;
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
  bool Wait() override;

--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -28,6 +28,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/platform/macros.h"
 namespace paddle {
 namespace operators {
@@ -49,23 +50,77 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
 class RPCServer;
-struct VarHandle {
+class VarHandle {  // describes one RPC request (endpoint, method, variable name) and lets callers wait for its result
-  // RPC endpoint.
+ public:
-  std::string ep;
+  VarHandle(const std::string ep, const std::string& method,  // NOTE(review): ep is taken by value, unlike method and name
-  const platform::DeviceContext* ctx;
+            const std::string& name,
-  const framework::Scope* scope;
+            const platform::DeviceContext* p_ctx = nullptr,
-  // Variable name.
+            const framework::Scope* p_scope = nullptr)
-  std::string name;
+      : ok_(kVarHandleDefaultState) {  // starts in the not-finished state (-1)
-  // RPC method name.
+    ep_ = ep;
-  std::string method;
+    ctx_ = p_ctx;
+    scope_ = p_scope;
+    name_ = name;
+    method_ = method;
+  }
+  virtual ~VarHandle() {}
+ public:
+  bool Wait() {  // blocks until Finish() has been called, then returns the flag passed to it
+    {
+      std::unique_lock<std::mutex> lk(sync_mutex_);
+      wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; });
+    }
+    VLOG(7) << "VarHandle wait:" << ok_;
+    return ok_ != 0;  // NOTE(review): ok_ is read here, and logged above, after sync_mutex_ is released
+  }
+  void Finish(bool ok) {  // records the result and wakes every thread blocked in Wait()
+    {
+      std::unique_lock<std::mutex> lk(sync_mutex_);
+      ok_ = ok;  // a bool is stored as 0 or 1, so it never equals the -1 default
+    }
+    VLOG(7) << "VarHandle finish:" << ok;
+    wait_cond_.notify_all();
+  }
  std::string String() const {
    std::ostringstream s;
-    s << method << " name:[" << name << "], ep:[" << ep << "]";
+    s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_
+      << "]";
    return s.str();
  }
+  std::string ep() const { return ep_; }
+  const platform::DeviceContext* ctx() const { return ctx_; }
+  const framework::Scope* scope() const { return scope_; }
+  std::string name() const { return name_; }
+  std::string method() const { return method_; }
+ protected:
+  // RPC endpoint.
+  std::string ep_;
+  const platform::DeviceContext* ctx_;
+  const framework::Scope* scope_;
+  // Variable name.
+  std::string name_;
+  // RPC method name.
+  std::string method_;
+ protected:
+  std::mutex sync_mutex_;
+  std::condition_variable wait_cond_;
+  int ok_;  // -1 until Finish() is called, then 0 for Finish(false) or 1 for Finish(true)
+  static const int kVarHandleDefaultState = -1;
+ private:
+  DISABLE_COPY_AND_ASSIGN(VarHandle);  // presumably forbids copying (macro from platform/macros.h) - TODO confirm
 };
+typedef std::shared_ptr<VarHandle> VarHandlePtr;
 class RequestHandler {
 public:
  explicit RequestHandler(bool sync_mode)

--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -14,12 +14,14 @@
 #pragma once
+#include <condition_variable>  // NOLINT
 #include <string>
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 DECLARE_int32(rpc_deadline);
@@ -31,37 +33,36 @@ class RPCClient {
 public:
  RPCClient() {}
  virtual ~RPCClient() {}
-  virtual bool AsyncSendVar(const std::string& ep,
+  virtual VarHandlePtr AsyncSendVar(const std::string& ep,
                                    const platform::DeviceContext& ctx,
                                    const framework::Scope& scope,
                                    const std::string& var_name,
                                    int64_t time_out = FLAGS_rpc_deadline) = 0;
-  virtual bool AsyncGetVar(const std::string& ep,
+  virtual VarHandlePtr AsyncGetVar(const std::string& ep,
                                   const platform::DeviceContext& ctx,
                                   const framework::Scope& scope,
                                   const std::string& var_name,
                                   int64_t time_out = FLAGS_rpc_deadline) = 0;
-  virtual bool AsyncPrefetchVar(const std::string& ep,
+  virtual VarHandlePtr AsyncPrefetchVar(
-                                const platform::DeviceContext& ctx,
+      const std::string& ep, const platform::DeviceContext& ctx,
-                                const framework::Scope& scope,
+      const framework::Scope& scope, const std::string& in_var_name,
-                                const std::string& in_var_name,
      const std::string& out_var_name,
      int64_t time_out = FLAGS_rpc_deadline) = 0;
-  virtual void AsyncSendBatchBarrier(const std::string& ep,
+  virtual VarHandlePtr AsyncSendBatchBarrier(
-                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
-  virtual void AsyncSendFetchBarrier(const std::string& ep,
+  virtual VarHandlePtr AsyncSendFetchBarrier(
-                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
-  virtual void AsyncCheckpointNotify(const std::string& ep,
+  virtual VarHandlePtr AsyncCheckpointNotify(
-                                     const std::string& dir,
+      const std::string& ep, const std::string& dir,
      int64_t time_out = FLAGS_rpc_deadline) = 0;
-  virtual void AsyncSendComplete(const std::string& ep,
+  virtual VarHandlePtr AsyncSendComplete(
-                                 int64_t time_out = FLAGS_rpc_deadline) = 0;
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
  // Complete tells all the pserver instances that this trainer has finished
  // training, so the pserver can reduce its barrier count and continue to train

--- a/paddle/fluid/operators/distributed/varhandle_test.cc
+++ b/paddle/fluid/operators/distributed/varhandle_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+using paddle::operators::distributed::VarHandlePtr;
+using paddle::operators::distributed::VarHandle;
+void WaitTrue(VarHandlePtr s) { EXPECT_TRUE(s->Wait()); }  // thread body: blocks in Wait() and expects it to return true
+void WaitFalse(VarHandlePtr s) { EXPECT_FALSE(s->Wait()); }  // thread body: blocks in Wait() and expects it to return false
+TEST(VarHandle, Run) {  // each waiter thread must observe the value later passed to Finish() on its handle
+  std::vector<VarHandlePtr> a;
+  for (int i = 0; i < 12; i++) {
+    VarHandlePtr s(new VarHandle("", "", "", nullptr, nullptr));
+    a.push_back(s);
+  }
+  std::vector<std::unique_ptr<std::thread>> t;
+  for (int i = 0; i < 6; i++) {
+    t.emplace_back(new std::thread(WaitFalse, a[i]));  // t[i] waits on a[i]
+  }
+  for (int i = 0; i < 6; i++) {
+    a[i]->Finish(false);  // releases the waiter with a false result
+    t[i]->join();
+  }
+  for (int i = 6; i < 12; i++) {
+    t.emplace_back(new std::thread(WaitTrue, a[i]));  // appended at t[6..11], so t[i] still pairs with a[i]
+  }
+  for (int i = 6; i < 12; i++) {
+    a[i]->Finish(true);  // releases the waiter with a true result
+    t[i]->join();
+  }
+}
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase {
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
                << outs[i] << " back";
-        rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]);
+        rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope,
+                                                    ins[i], outs[i]));
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
      }
    }
-    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+    for (size_t i = 0; i < rets.size(); i++) {
+      PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+    }
  }
 };

--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -44,12 +44,15 @@ class RecvOp : public framework::OperatorBase {
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < outs.size(); i++) {
      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
-      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
+      rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
    }
    if (sync_mode) {
-      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+      for (size_t i = 0; i < rets.size(); i++) {
+        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+      }
    }
  }
 };

--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <future>  // NOLINT
 #include <ostream>
+#include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -45,18 +46,19 @@ class SendOp : public framework::OperatorBase {
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        // TODO(Yancey1989): we need to use an IO threadpool which has
+        rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]));
-        // a larger number of threads than the computing threadpool.
-        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
      }
    }
    if (sync_send) {
-      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+      for (size_t i = 0; i < rets.size(); i++) {
+        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+      }
    }
  }
 };

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -128,6 +128,13 @@ class ParallelExecutor(object):
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                exec_strategy.num_threads = cpu_num * 2
+        # Use a single executor thread in nccl2 distributed mode
+        #   so that all GPUs run ops in the same order.
+        if num_trainers > 1:
+            assert (use_cuda)
+            # FIXME(gongwb): avoid this set.
+            exec_strategy.num_threads = 1
        if build_strategy is None:
            build_strategy = BuildStrategy()