Commit adc7ba2e, authored by T tensor-tang

Merge remote-tracking branch 'ups/develop' into refine/jit

@@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
+### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.1.0.post87
+pip install paddlepaddle-gpu==1.2.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.1.0.post85
+pip install paddlepaddle-gpu==1.2.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85
 ## Installation
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
 ## Documentation
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
   You can run distributed training jobs on MPI clusters.
-- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
   Our new API enables much shorter programs.
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
   We appreciate your contributions!
......
@@ -66,6 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
......
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
+add_subdirectory(imperative)
 add_subdirectory(operators)
 add_subdirectory(string)
 add_subdirectory(recordio)
......
@@ -33,11 +33,7 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
   CheckInit();
   for (size_t i = 0; i < use_slots_.size(); ++i) {
     if (name == use_slots_[i]) {
-      if (use_slots_is_dense_[i]) {
-        feed_vec_[i] = MixTensor(var->GetMutable<Tensor>());
-      } else {
-        feed_vec_[i] = MixTensor(var->GetMutable<LoDTensor>());
-      }
+      feed_vec_[i] = var->GetMutable<LoDTensor>();
     }
   }
 }
@@ -301,6 +297,7 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
           "the data, please check if the data contains unresolvable "
           "characters.\nplease check this error line: %s",
           str);
     if (idx != -1) {
       (*instance)[idx].Init(all_slots_type_[i]);
       if ((*instance)[idx].GetType()[0] == 'f') {  // float
@@ -337,6 +334,7 @@ void MultiSlotDataFeed::AddInstanceToInsVec(
       (*ins_vec)[i].InitOffset();
     }
   }
   for (size_t i = 0; i < instance.size(); ++i) {
     (*ins_vec)[i].AddIns(instance[i]);
   }
@@ -348,36 +346,25 @@ void MultiSlotDataFeed::PutToFeedVec(
     const auto& type = ins_vec[i].GetType();
     const auto& offset = ins_vec[i].GetOffset();
     int total_instance = static_cast<int>(offset.back());
     if (type[0] == 'f') {  // float
       const auto& feasign = ins_vec[i].GetFloatData();
-      if (feed_vec_[i].IsDense()) {
-        int size_in_each_batch = total_instance / batch_size_;
-        float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<float>(
-            {batch_size_, size_in_each_batch}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
-      } else {
-        float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data<float>(
-            {total_instance, 1}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
-        LoD data_lod{offset};
-        feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
-      }
+      float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
     } else if (type[0] == 'u') {  // uint64
       // no uint64_t type in paddlepaddle
       const auto& feasign = ins_vec[i].GetUint64Data();
-      if (feed_vec_[i].IsDense()) {
-        int size_in_each_batch = total_instance / batch_size_;
-        int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>(
-            {batch_size_, size_in_each_batch}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
-      } else {
-        int64_t* tensor_ptr =
-            feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>(
-                {total_instance, 1}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
-        LoD data_lod{offset};
-        feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
-      }
+      int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
+    }
+    LoD data_lod{offset};
+    feed_vec_[i]->set_lod(data_lod);
+    if (use_slots_is_dense_[i]) {
+      int dim = total_instance / batch_size_;
+      feed_vec_[i]->Resize({batch_size_, dim});
     }
   }
 }
......
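The rewritten PutToFeedVec above first fills every slot as a flat {total_instance, 1} LoDTensor with its offsets as the LoD, and only then reshapes dense slots to {batch_size, dim}. A minimal standalone sketch of that offset bookkeeping, with toy names and no Paddle dependency (not the real API):

```
#include <cstdio>
#include <vector>

int main() {
  // LoD-style offsets for a batch of 3 instances: instance i owns the
  // feasigns in [offset[i], offset[i+1]). Equal widths => a dense slot.
  std::vector<size_t> offset = {0, 3, 6, 9};
  int batch_size = static_cast<int>(offset.size()) - 1;
  int total_instance = static_cast<int>(offset.back());  // 9 feasigns

  // Stand-in for mutable_data<float>({total_instance, 1}, CPUPlace()).
  std::vector<float> flat(total_instance, 1.0f);

  // Sparse slots keep the offsets as the LoD; dense slots have a fixed
  // width per instance, so the same flat buffer can simply be re-viewed as
  // {batch_size, dim} -- which is all the final Resize() does.
  if (total_instance % batch_size == 0) {
    int dim = total_instance / batch_size;
    std::printf("dense view: {%d, %d}\n", batch_size, dim);
  } else {
    std::printf("sparse view: {%d, 1} plus LoD offsets\n", total_instance);
  }
  return 0;
}
```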
@@ -30,35 +30,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-// Pack Tensor type and LoDTensor type into MixTensor type, in order
-// to record either Tensor or LoDTensor information at the same time.
-class MixTensor {
- public:
-  MixTensor() {}
-  explicit MixTensor(LoDTensor* lodtensor) {
-    is_dense_ = false;
-    lodtensor_ = lodtensor;
-  }
-  explicit MixTensor(Tensor* tensor) {
-    is_dense_ = true;
-    tensor_ = tensor;
-  }
-  bool IsDense() { return is_dense_; }
-  LoDTensor* GetLoDTensor() {
-    PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr.");
-    return lodtensor_;
-  }
-  Tensor* GetTensor() {
-    PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr.");
-    return tensor_;
-  }
-
- private:
-  bool is_dense_;
-  LoDTensor* lodtensor_;
-  Tensor* tensor_;
-};
 // DataFeed is the base virtual class for all ohther DataFeeds.
 // It is used to read files and parse the data for subsequent trainer.
 // Example:
@@ -133,7 +104,7 @@ class DataFeed {
       use_slots_index_;  // -1: not used; >=0: the index of use_slots_
   // The data read by DataFeed will be stored here
-  std::vector<MixTensor> feed_vec_;
+  std::vector<LoDTensor*> feed_vec_;
   // the batch size defined by user
   int default_batch_size_;
......
@@ -152,19 +152,13 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
   const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
   std::map<std::string, const paddle::framework::LoDTensor*>
       lodtensor_targets;
-  std::map<std::string, const paddle::framework::Tensor*> tensor_targets;
   for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
     const auto& slot = multi_slot_desc.slots(i);
     if (slot.is_used()) {
       const auto& name = slot.name();
       readers[idx]->AddFeedVar(scope->Var(name), name);
-      if (slot.is_dense()) {
-        tensor_targets[name] =
-            &scope->FindVar(name)->Get<paddle::framework::Tensor>();
-      } else {
-        lodtensor_targets[name] =
-            &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
-      }
+      lodtensor_targets[name] =
+          &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
     }
   }
   readers[idx]->Start();
@@ -175,8 +169,9 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
       if (!slot.is_used()) {
         continue;
       }
+      const paddle::framework::LoDTensor* tens =
+          lodtensor_targets[slot.name()];
       if (slot.is_dense()) {  // dense branch
-        const paddle::framework::Tensor* tens = tensor_targets[slot.name()];
         if (slot.type() == "uint64") {
           const int64_t* data = tens->data<int64_t>();
           int batch_size = tens->dims()[0];
@@ -202,8 +197,6 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
           PADDLE_THROW("Error type in proto file.");
         }
       } else {  // sparse branch
-        const paddle::framework::LoDTensor* tens =
-            lodtensor_targets[slot.name()];
         if (slot.type() == "uint64") {
           const int64_t* data = tens->data<int64_t>();
           for (size_t i = 0; i < tens->NumElements(); ++i) {
......
@@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
   auto out_format =
       platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
-  void* in_data = GetDataFromTensor(in, in_type);
   // output tensor has the same dims as input. Reorder don't change dims
   out->Resize(in.dims());
-  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
-  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
-  auto out_memory =
-      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
-  platform::Reorder(in_memory, out_memory);
+  if (in_format != out_format) {
+    void* in_data = GetDataFromTensor(in, in_type);
+    auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
+    auto in_memory =
+        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+    auto out_memory =
+        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+    platform::Reorder(in_memory, out_memory);
+  } else {
+    out->ShareDataWith(in);
+  }
   out->set_layout(out_layout);
   // reset format since the out tensor will be feed to non-MKLDNN OPkernel
   out->set_format(memory::format::format_undef);
......
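The new early-out above skips the MKL-DNN reorder when the source and destination formats already match, aliasing the output buffer to the input instead of copying. A standalone sketch of that control flow with toy stand-in types (not the Paddle or MKL-DNN API):

```
#include <cstdio>
#include <memory>
#include <vector>

enum class Format { nchw, nhwc };

struct Tensor {
  std::shared_ptr<std::vector<float>> data;
  Format format{Format::nchw};
};

void TransLayout(const Tensor& in, Format out_format, Tensor* out) {
  if (in.format != out_format) {
    // The real code builds mkldnn::memory for both sides and runs Reorder.
    out->data = std::make_shared<std::vector<float>>(*in.data);
  } else {
    out->data = in.data;  // alias, no copy -- the ShareDataWith() path
  }
  out->format = out_format;
}

int main() {
  Tensor in{std::make_shared<std::vector<float>>(8, 1.f), Format::nchw};
  Tensor out;
  TransLayout(in, Format::nchw, &out);
  std::printf("aliased: %s\n", out.data == in.data ? "yes" : "no");
  return 0;
}
```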
@@ -15,14 +15,26 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
 if(WITH_GPU)
   nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
           dynload_cuda variable_visitor)
-  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
+  if(WITH_DISTRIBUTE)
+    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor sendrecvop_grpc)
+  else()
+    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+  endif()
   nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
   nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 else()
   cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
           variable_visitor)
-  cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
+  if(WITH_DISTRIBUTE)
+    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim selected_rows_functor sendrecvop_grpc)
+  else()
+    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim selected_rows_functor)
+  endif()
   cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
   cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
......
@@ -58,6 +58,17 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       }
     }
+    CollectiveContext *context = CollectiveContext::GetInstance();
+    context->endpoints_ = strategy_.trainers_endpoints_;
+    context->trainer_id_ = strategy_.trainer_id_;
+    PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0");
+    if (strategy_.trainer_id_ > 0) {
+      PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) <
+                         strategy_.trainers_endpoints_.size(),
+                     "trainer_id_ < endpoints_ size");
+    }
+    VLOG(1) << "CollectiveContext:" << context->String();
     // Convert graph to run on multi-devices.
     auto multi_devices_pass = AppendPass("multi_devices_pass");
     multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
@@ -135,16 +146,16 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
     } else if (pass->Type() == "sequential_execution_pass") {
-      VLOG(1) << "set enable_sequential_execution:"
-              << enable_sequential_execution_;
+      LOG(INFO) << "set enable_sequential_execution:"
+                << enable_sequential_execution_;
       pass->Erase(kAllOpDescs);
       pass->Set<const std::vector<OpDesc *>>(
           kAllOpDescs,
           new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     } else if (pass->Type() == "all_reduce_deps_pass") {
-      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
-              << ", num_trainers:" << num_trainers_;
+      LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
+                << ", num_trainers:" << num_trainers_;
       pass->Erase(kAllOpDescs);
       pass->Set<const std::vector<OpDesc *>>(
......
@@ -74,6 +74,8 @@ struct BuildStrategy {
   bool fuse_broadcast_op_{false};
   int num_trainers_{1};
+  int trainer_id_{0};
+  std::vector<std::string> trainers_endpoints_;
   bool remove_unnecessary_lock_{false};
   // NOTE:
......
@@ -53,7 +53,7 @@ struct ReduceLoDTensor {
   }
 };
-inline void GatherSelectedRows(
+inline void GatherLocalSelectedRows(
     const std::vector<const SelectedRows *> &src_selecte_rows_,
     const std::vector<platform::Place> &in_places,
     const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
......
@@ -16,6 +16,12 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/distributed/collective_client.h"
+#include "paddle/fluid/operators/distributed/collective_server.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#endif
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/profiler.h"
 DEFINE_bool(
@@ -26,6 +32,112 @@ namespace paddle {
 namespace framework {
 namespace details {
+std::once_flag CollectiveContext::init_flag_;
+std::unique_ptr<CollectiveContext> CollectiveContext::context_;
+
+static inline std::string GetRemoteVarName(const std::string &var_name,
+                                           int trainer_id) {
+  return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id);
+}
+
+void ReduceOpHandle::Wait(
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes) {
+  // TODO(gongwb): use event wait?
+  for (auto &dev_ctx : dev_ctxes) {
+    dev_ctx.second->Wait();
+  }
+}
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+template <typename DevCtx, typename DataType>
+void ReduceOpHandle::GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selected_rows,
+    const std::vector<platform::Place> &in_places,
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+    VarHandle *out_var_handle, const platform::Place &out_place,
+    SelectedRows *dst_selected_rows) {
+  const CollectiveContext &collective_context =
+      *CollectiveContext::GetInstance();
+
+  // 1. gather local selected rows, merge them
+  std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp";
+  auto scope = local_scopes_.at(out_var_handle->scope_idx_);
+  auto gathered_var_mid = scope->Var(gathered_var_name);
+  auto gathered_select_rows =
+      gathered_var_mid->GetMutable<framework::SelectedRows>();
+  GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place,
+                          gathered_select_rows);
+  // FIXME(gongwb): remove this Wait.
+  Wait(dev_ctxes);
+
+  // merge them
+  auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
+  std::string merged_var_name =
+      GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_);
+  auto merged_select_rows =
+      scope->Var(merged_var_name)->GetMutable<SelectedRows>();
+  operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
+  merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
+
+  // 2. start collective server if it doesn't exist
+  operators::distributed::CollectiveServer *server =
+      operators::distributed::CollectiveServer::GetInstance(
+          collective_context.endpoints_[collective_context.trainer_id_],
+          collective_context.endpoints_.size() - 1);
+
+  auto rpc_server = server->GetRPCServer();
+  rpc_server->RegisterVar(merged_var_name,
+                          operators::distributed::kRequestGetMonomerVariable,
+                          scope, merged_dev_ctx);
+
+  // 3. gather them from all remote nodes.
+  std::vector<const SelectedRows *> remote;
+  operators::distributed::CollectiveClient *client =
+      operators::distributed::CollectiveClient::GetInstance();
+
+  std::vector<operators::distributed::RemoteVar> vars;
+  for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
+    if (i == (unsigned)collective_context.trainer_id_) continue;
+
+    operators::distributed::RemoteVar var;
+    var.trainer_id_ = i;
+    var.var_name_ = GetRemoteVarName(out_var_handle->name_, i);
+    var.ep_ = collective_context.endpoints_[i];
+
+    vars.push_back(var);
+    VLOG(4) << "gather from:" << var.String();
+  }
+
+  // erase gathered vars
+  merged_dev_ctx->Wait();
+  scope->EraseVars(std::vector<std::string>{gathered_var_name});
+
+  PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
+  PADDLE_ENFORCE(remote.size() == vars.size());
+
+  // 4. merged local selected rows.
+  std::vector<const SelectedRows *> all;
+  all.resize(collective_context.endpoints_.size());
+  for (auto v : vars) {
+    all[v.trainer_id_] =
+        scope->FindVar(v.var_name_)->GetMutable<SelectedRows>();
+  }
+  all[collective_context.trainer_id_] = merged_select_rows;
+
+  merge_func(*merged_dev_ctx, all, dst_selected_rows);
+
+  rpc_server->WaitVarBarrier(merged_var_name);
+  rpc_server->ClearVar(merged_var_name);
+
+  // 5. clear mid vars
+  std::vector<std::string> tmp_vars{merged_var_name};
+  for (auto r : vars) {
+    tmp_vars.push_back(r.var_name_);
+  }
+  scope->EraseVars(tmp_vars);
+}
+#endif
+
 void ReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
@@ -90,8 +202,36 @@ void ReduceOpHandle::RunImpl() {
     this->RunAndRecordEvent([&] {
       std::vector<const SelectedRows *> in_selected_rows =
           GetInputValues<SelectedRows>(in_var_handles, var_scopes);
-      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
-                         out_var->GetMutable<framework::SelectedRows>());
+
+      const CollectiveContext &collective_context =
+          *CollectiveContext::GetInstance();
+      VLOG(10) << "GatherSelectedRows CollectiveContext:"
+               << collective_context.String();
+
+      // TODO(gongwb): add cpu support
+      if (collective_context.endpoints_.size() <= 1 ||
+          is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
+        GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
+                                t_out_p,
+                                out_var->GetMutable<framework::SelectedRows>());
+        return;
+      }
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+      if (framework::IsType<const float>(in_selected_rows[0]->value().type())) {
+        GatherSelectedRows<platform::CUDADeviceContext, float>(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+      } else if (framework::IsType<const double>(
+                     in_selected_rows[0]->value().type())) {
+        GatherSelectedRows<platform::CUDADeviceContext, double>(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+      } else {
+        PADDLE_ENFORCE(false,
+                       "only support double or float when gahter SelectedRows");
+      }
+#endif
     });
   } else {
     std::vector<const LoDTensor *> lod_tensors =
......
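Steps 1 and 4 of GatherSelectedRows above both rely on MergeAdd, which accumulates rows that share an id across several SelectedRows inputs. A standalone toy sketch of that merge (simplified types; the real functor runs on a device context):

```
#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// Toy stand-in for SelectedRows: a list of (row id, value) pairs.
using ToySelectedRows = std::vector<std::pair<int64_t, float>>;

ToySelectedRows MergeAdd(const std::vector<ToySelectedRows>& inputs) {
  std::map<int64_t, float> acc;  // rows with the same id are summed
  for (const auto& in : inputs)
    for (const auto& kv : in) acc[kv.first] += kv.second;
  return ToySelectedRows(acc.begin(), acc.end());
}

int main() {
  ToySelectedRows local{{0, 1.f}, {3, 2.f}};
  ToySelectedRows remote{{3, 4.f}, {7, 1.f}};  // row 3 overlaps with local
  for (const auto& kv : MergeAdd({local, remote}))
    std::printf("row %lld -> %.1f\n", static_cast<long long>(kv.first),
                kv.second);
  return 0;
}
```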
@@ -30,6 +30,32 @@
 namespace paddle {
 namespace framework {
 namespace details {
+struct CollectiveContext {
+  std::vector<std::string> endpoints_;
+  int trainer_id_{0};
+
+  std::string String() const {
+    std::stringstream ss;
+    ss << "endpoints_:";
+    for (auto e : endpoints_) {
+      ss << e << ",";
+    }
+    ss << "trainer_id_:" << trainer_id_;
+    return ss.str();
+  }
+
+  static CollectiveContext *GetInstance() {
+    std::call_once(init_flag_,
+                   [&]() { context_.reset(new CollectiveContext()); });
+    return context_.get();
+  }
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<CollectiveContext> context_;
+};
 struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
@@ -64,6 +90,19 @@ struct ReduceOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+  template <typename DevCtx, typename DataType>
+  void GatherSelectedRows(
+      const std::vector<const SelectedRows *> &src_selecte_rows_,
+      const std::vector<platform::Place> &in_places,
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+      VarHandle *out_var_handle, const platform::Place &out_place,
+      SelectedRows *dst_selecte_rows);
+#endif
+
+  void Wait(
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes);
+
   template <typename T>
   std::vector<const T *> GetInputValues(
       const std::vector<VarHandle *> &in_var_handles,
......
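CollectiveContext above uses the std::call_once lazy-singleton idiom (the same pattern appears again in imperative::GetEngine later in this commit). A minimal standalone version of that idiom:

```
#include <cstdio>
#include <memory>
#include <mutex>

struct Context {
  static Context* GetInstance() {
    // The lambda runs exactly once, even under concurrent first access.
    std::call_once(init_flag_, [] { instance_.reset(new Context()); });
    return instance_.get();
  }

 private:
  static std::once_flag init_flag_;
  static std::unique_ptr<Context> instance_;
};

std::once_flag Context::init_flag_;
std::unique_ptr<Context> Context::instance_;

int main() {
  std::printf("same instance: %s\n",
              Context::GetInstance() == Context::GetInstance() ? "yes" : "no");
  return 0;
}
```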
@@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() {
   static unsigned concurrency_cap = std::thread::hardware_concurrency();
   int thread_id = this->thread_id_;
-  if (thread_id < concurrency_cap) {
+  if (static_cast<unsigned>(thread_id) < concurrency_cap) {
     unsigned proc = thread_id;
     cpu_set_t mask;
......
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "glog/logging.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace framework {
@@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
   return tensor;
 }
+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
+  Variable* var = scope.FindVar(var_name);
+  PADDLE_ENFORCE(var, "%s no in scope", var_name);
+  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  return *var->GetMutable<LoDTensor>();
+}
 }  // namespace framework
 }  // namespace paddle
@@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
 LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index);
+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
 }  // namespace framework
 }  // namespace paddle
@@ -46,14 +46,16 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
   auto* scope = param_scope();
   PADDLE_ENFORCE(scope);
+  std::string type = is_conv3d() ? "conv3d" : "conv2d";
   GraphPatternDetector gpd;
   auto* conv_input =
       gpd.mutable_pattern()
           ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
           ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
+          ->assert_is_op_input(type, "Input");
   patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
-  conv_bias_pattern(conv_input);
+  conv_bias_pattern(conv_input, is_conv3d());
   int found_conv_bias_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
@@ -109,7 +111,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
     desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
     desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
     desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
-    desc.SetType("conv2d");
+    desc.SetType(type);
     for (auto& attr : conv->Op()->GetAttrMap()) {
       desc.SetAttr(attr.first, attr.second);
@@ -135,3 +137,5 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
 }  // namespace paddle
 REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
               paddle::framework::ir::ConvBiasFusePass);
+REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
+              paddle::framework::ir::Conv3DBiasFusePass);
@@ -26,11 +26,19 @@ namespace ir {
 class ConvBiasFusePass : public FusePassBase {
  public:
   virtual ~ConvBiasFusePass() {}
+  virtual bool is_conv3d() const { return false; }
 protected:
   std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
   const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
+/*
+ * Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
+ */
+class Conv3DBiasFusePass : public ConvBiasFusePass {
+ public:
+  bool is_conv3d() const override { return true; }
+};
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) {
       switch (role_id) {
         case _INT(OpRole::kForward):
           if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-            LOG(ERROR)
-                << "Cannot add backward operator before forward operator %s."
-                << op->Type();
+            LOG(ERROR) << "Cannot add backward operator before forward operator "
+                       << op->Type();
           }
           break;
         case _INT(OpRole::kBackward):
......
@@ -1030,10 +1030,11 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
 }
 PDNode *patterns::ConvBias::operator()(
-    paddle::framework::ir::PDNode *conv_input) {
+    paddle::framework::ir::PDNode *conv_input, bool is_conv3d) {
+  std::string type = is_conv3d ? "conv3d" : "conv2d";
   // Create Operators
-  conv_input->assert_is_op_input("conv2d", "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  conv_input->assert_is_op_input(type, "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(type);
   auto *eltiwse_op =
       pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
   // Create variables
@@ -1041,11 +1042,11 @@ PDNode *patterns::ConvBias::operator()(
   auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
                               ->AsInput()
                               ->assert_is_persistable_var()
-                              ->assert_is_op_input("conv2d", "Filter");
+                              ->assert_is_op_input(type, "Filter");
   // intermediate variable, will be removed in the IR after fuse.
   auto *conv_out_var = pattern->NewNode(conv_out_repr())
                            ->AsIntermediate()
-                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_only_output_of_op(type)
                            ->assert_is_op_input("elementwise_add");
   // Bias stored in elementwise_add
   auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
......
@@ -623,7 +623,7 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
 struct ConvBias : public PatternBase {
   ConvBias(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "conv_bias") {}
-  PDNode* operator()(PDNode* conv_input);
+  PDNode* operator()(PDNode* conv_input, bool is_conv3d = false);
   // declare operator node's name
   PATTERN_DECL_NODE(conv);
   PATTERN_DECL_NODE(eltwise);
......
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp()) {
       auto* op = n->Op();
-      if (n->RuntimeHasAttr("is_test")) {
+      if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) {
         op->SetAttr("is_test", true);
       } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                  end(op_list)) {
......
@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
     auto* op = node->Op();
     auto op_name = boost::get<std::string>(op->GetAttr("name"));
     if (op_name == "conv3") {
-      ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
+      ASSERT_FALSE(op->HasAttr("is_test"));
     } else {
-      ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
+      ASSERT_TRUE(op->HasAttr("is_test"));
       EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
     }
   }
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+#include <string>
 namespace paddle {
 namespace framework {
@@ -21,9 +22,19 @@ namespace ir {
 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
   for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
-      n->Op()->SetAttr("use_mkldnn", true);
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) {
+        if (op_types_list.empty()) {
+          op->SetAttr("use_mkldnn", true);
+        } else if (std::find(op_types_list.begin(), op_types_list.end(),
+                             n->Name()) != op_types_list.end()) {
+          op->SetAttr("use_mkldnn", true);
+        }
+      }
     }
   }
   return graph;
@@ -33,5 +44,5 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle
-REGISTER_PASS(mkldnn_placement_pass,
-              paddle::framework::ir::MKLDNNPlacementPass);
+REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass)
+    .RequirePassAttr("mkldnn_enabled_op_types");
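The placement filter above turns on use_mkldnn either for every capable op (when the pass attribute is an empty set) or only for ops named in mkldnn_enabled_op_types. A standalone sketch of that predicate, with a hypothetical helper name:

```
#include <cstdio>
#include <string>
#include <unordered_set>

// Hypothetical helper, not part of the Paddle API: an empty allow-list
// means "enable everywhere", otherwise only listed op types are enabled.
bool ShouldEnableMKLDNN(const std::string& op_type,
                        const std::unordered_set<std::string>& allow_list) {
  return allow_list.empty() || allow_list.count(op_type) > 0;
}

int main() {
  std::unordered_set<std::string> allow_list{"conv2d", "pool2d"};
  std::printf("conv2d: %d, softmax: %d\n",
              ShouldEnableMKLDNN("conv2d", allow_list),
              ShouldEnableMKLDNN("softmax", allow_list));
  return 0;
}
```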
@@ -30,28 +30,6 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
   return std::unique_ptr<Node>(new Node(name, type));
 }
-bool Node::RuntimeHasAttr(const std::string &name) const {
-  if (Op()->HasAttr(name)) {
-    return true;
-  } else {
-    auto &op_info = OpInfoMap::Instance();
-    auto op_type = Op()->Type();
-    if (op_info.Has(op_type)) {
-      auto op_info_ptr = op_info.Get(op_type);
-      if (op_info_ptr.HasOpProtoAndChecker()) {
-        const proto::OpProto &proto = op_info_ptr.Proto();
-        for (int i = 0; i != proto.attrs_size(); ++i) {
-          const proto::OpProto::Attr &attr = proto.attrs(i);
-          if (attr.name() == name) {
-            return true;
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -108,18 +108,6 @@ class Node {
            Name().find(ir::Node::kControlDepVarName) != std::string::npos;
   }
-  // RuntimeHasAttr is different with HasAttr now.
-  // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
-  // thus, if stored program_desc_ are old which don't have an attr, a new
-  // library which adds the attr already will fail on this function.
-  // Details:
-  // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
-  // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
-  // problem.
-  // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
-  // RuntimeHasAttr.
-  bool RuntimeHasAttr(const std::string& name) const;
   std::vector<Node*> inputs;
   std::vector<Node*> outputs;
......
@@ -237,6 +237,23 @@ void OpDesc::SetOutput(const std::string &param_name,
   this->outputs_[param_name] = args;
 }
+bool OpDesc::HasProtoAttr(const std::string &name) const {
+  auto &op_info = OpInfoMap::Instance();
+  if (op_info.Has(desc_.type())) {
+    auto op_info_ptr = op_info.Get(desc_.type());
+    if (op_info_ptr.HasOpProtoAndChecker()) {
+      const proto::OpProto &proto = op_info_ptr.Proto();
+      for (int i = 0; i != proto.attrs_size(); ++i) {
+        const proto::OpProto::Attr &attr = proto.attrs(i);
+        if (attr.name() == name) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
......
@@ -65,6 +65,8 @@ class OpDesc {
     return attrs_.find(name) != attrs_.end();
  }
+  bool HasProtoAttr(const std::string &name) const;
+
   proto::AttrType GetAttrType(const std::string &name) const;
   std::vector<std::string> AttrNames() const;
......
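HasProtoAttr complements HasAttr: HasAttr only sees attributes stored in a (possibly old) serialized program, while HasProtoAttr consults the registered op proto, which is why is_test_pass and mkldnn_placement_pass above now check both. A toy standalone illustration of the distinction (not the Paddle types):

```
#include <cstdio>
#include <set>
#include <string>

struct ToyOpDesc {
  std::set<std::string> stored_attrs;  // what the serialized program carries
  std::set<std::string> proto_attrs;   // what the registered op declares

  bool HasAttr(const std::string& n) const { return stored_attrs.count(n); }
  bool HasProtoAttr(const std::string& n) const { return proto_attrs.count(n); }
};

int main() {
  ToyOpDesc op;
  op.proto_attrs.insert("is_test");  // attr added in a newer library version
  // An old program desc lacks the attr, but the proto still knows about it:
  std::printf("HasAttr: %d, HasProtoAttr: %d\n",
              static_cast<int>(op.HasAttr("is_test")),
              static_cast<int>(op.HasProtoAttr("is_test")));
  return 0;
}
```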
cc_library(layer SRCS layer.cc DEPS proto_desc operator)
cc_library(tracer SRCS tracer.cc DEPS proto_desc)
cc_library(engine SRCS engine.cc)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/engine.h"
#include <mutex> // NOLINT
#include <vector>
#include "glog/logging.h"
namespace paddle {
namespace imperative {
static std::once_flag init_engine;
static Engine* engine;
class DummyEngine : public Engine {
public:
void Enqueue(Runnable* runnable) override {
queued_runnables_.push_back(runnable);
}
size_t Size() const override { return queued_runnables_.size(); }
void Sync() override {
for (Runnable* l : queued_runnables_) {
LOG(INFO) << "running " << reinterpret_cast<void*>(l);
}
queued_runnables_.clear();
}
private:
std::vector<Runnable*> queued_runnables_;
};
Engine* GetEngine() {
std::call_once(init_engine, []() { engine = new DummyEngine(); });
return engine;
}
} // namespace imperative
} // namespace paddle
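A hypothetical usage sketch of the engine above, assuming a target that links the new imperative engine library: work is enqueued and Sync() drains the queue (DummyEngine only logs each runnable):

```
#include "paddle/fluid/imperative/engine.h"

int main() {
  paddle::imperative::Runnable r1, r2;
  paddle::imperative::Engine* engine = paddle::imperative::GetEngine();
  engine->Enqueue(&r1);  // DummyEngine just queues the pointer
  engine->Enqueue(&r2);
  // Size() == 2 here; Sync() logs each queued runnable and clears the queue.
  engine->Sync();
  return 0;
}
```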
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <cstdint>
namespace paddle {
namespace imperative {
struct Runnable {};
class Engine {
public:
virtual ~Engine() {}
virtual void Enqueue(Runnable* runnable) = 0;
virtual size_t Size() const = 0;
virtual void Sync() = 0;
};
Engine* GetEngine();
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/layer.h"
#include <deque>
#include <limits>
#include <map>
#include <random>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace imperative {
using framework::Variable;
void AddTo(Variable* src, Variable* dst) {
framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld",
dst_tensor->numel(), src_tensor->numel());
float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
const float* src_data = src_tensor->data<float>();
for (size_t i = 0; i < src_tensor->numel(); ++i) {
dst_data[i] += src_data[i];
}
}
class Autograd {
public:
explicit Autograd(framework::Scope* scope) : scope_(scope) {}
void RunBackward(VarBase* var) {
PADDLE_ENFORCE(var->pre_op_->op_desc_);
// TODO(panyx0718): Only create for vars that "require_grad"
(*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
std::deque<OpBase*> ready;
ready.push_back(var->pre_op_);
std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->pre_op_);
while (!ready.empty()) {
OpBase* ready_op = ready.front();
ready.pop_front();
std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
for (size_t i = 0; i < input_grads.size(); ++i) {
if (!input_grads[i]) continue;
OpBase* pre_op = ready_op->pre_ops_->at(i);
if (!pre_op) continue;
dep_counts[pre_op] -= 1;
PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
bool pre_op_ready = dep_counts[pre_op] == 0;
if (pre_op_ready) {
ready.push_back(pre_op);
}
}
}
}
private:
std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
std::map<OpBase*, int> ret;
std::deque<OpBase*> queue;
queue.push_back(op);
std::unordered_set<OpBase*> visited;
visited.insert(op);
while (!queue.empty()) {
OpBase* candidate = queue.front();
queue.pop_front();
for (OpBase* pre_op : *(candidate->pre_ops_)) {
if (!pre_op) continue;
if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op);
queue.push_back(pre_op);
}
ret[pre_op] += 1;
}
}
return ret;
}
framework::Scope* scope_;
};
framework::Variable* CreateVariable(const std::string& name,
const framework::DDim& dim, float val,
framework::Scope* scope,
bool random_name = true) {
std::string varname = name;
if (random_name) {
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<int>::max());
int id = dist6(rng);
varname = string::Sprintf("%s@%d", varname, id);
}
VLOG(3) << "creating var " << varname;
framework::Variable* var = scope->Var(varname);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
std::fill(data, data + tensor->numel(), val);
return var;
}
framework::LoDTensor& VarBase::Grad() {
VLOG(3) << "get var grad " << var_desc_->Name();
return *grads_->GetMutable<framework::LoDTensor>();
}
void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
VLOG(3) << "apply var grad " << var_desc_->Name() << " "
<< grad->Get<framework::LoDTensor>().data<float>()[0];
if (!grads_) {
grads_ =
CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
}
AddTo(grad, grads_);
VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
<< grads_->Get<framework::LoDTensor>().data<float>()[0];
}
std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
VLOG(3) << "op grad " << grad_op_desc_->Type();
for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
// grad op inputs can be forward inputs, so not in grad_to_var.
continue;
}
VLOG(3) << "op grad in var " << grad_invar;
block_->FindRecursiveOrCreateVar(grad_invar);
framework::Variable* var = scope->Var(grad_invar);
const std::string& invar = grad_to_var_->at(grad_invar);
for (VarBase* varbase : *output_vars_) {
// Use the accumulated grads_ by sharing the input with grads_.
if (varbase->var_desc_->Name() == invar) {
var->GetMutable<framework::LoDTensor>()->ShareDataWith(
varbase->grads_->Get<framework::LoDTensor>());
break;
}
}
}
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
VLOG(3) << "grad outvar " << outvar;
block_->FindRecursiveOrCreateVar(outvar);
framework::Variable* var = scope->Var(outvar);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block_->FindVar(outvar);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
}
grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_);
opbase->Run(*scope, platform::CPUPlace());
// `ret` matches exactly with `input_vars_` of forward op.
std::vector<Variable*> ret;
for (size_t i = 0; i < input_vars_->size(); ++i) {
bool found = false;
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
Variable* var = scope->FindVar(outvar);
VarBase* origin_var = (*input_vars_)[i];
std::string orig_var = grad_to_var_->at(outvar);
PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
origin_var->ApplyGrad(scope, var);
found = true;
ret.push_back(var);
// TODO(panyx0718): There might be another outvar with the same name.
// In that case, it doesn't matter the first one or the second one is
// used.
break;
}
if (!found) {
ret.push_back(nullptr);
}
}
return ret;
}
void VarBase::RunBackward(framework::Scope* scope) {
grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
false);
if (!pre_op_) return;
Autograd(scope).RunBackward(this);
}
} // namespace imperative
} // namespace paddle
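Autograd::RunBackward above schedules gradient ops with a dependency count: a reverse BFS first records, for every upstream op, how many downstream ops feed off it, and an op's gradient runs only once that count drops to zero. A standalone toy sketch of the same two-pass scheme (toy graph, not the Paddle types):

```
#include <cstdio>
#include <deque>
#include <map>
#include <unordered_set>
#include <vector>

struct Op {
  const char* name;
  std::vector<Op*> pre_ops;  // ops that produced this op's inputs
};

int main() {
  Op a{"a", {}}, b{"b", {&a}}, c{"c", {&a}}, d{"d", {&b, &c}};

  // Pass 1: dependency counts, mirroring ComputeDepCounts().
  std::map<Op*, int> dep;
  std::deque<Op*> queue{&d};
  std::unordered_set<Op*> visited{&d};
  while (!queue.empty()) {
    Op* cur = queue.front();
    queue.pop_front();
    for (Op* pre : cur->pre_ops) {
      if (visited.insert(pre).second) queue.push_back(pre);
      dep[pre] += 1;
    }
  }

  // Pass 2: run an op's grad only when all of its dependents have run.
  std::deque<Op*> ready{&d};
  while (!ready.empty()) {
    Op* cur = ready.front();
    ready.pop_front();
    std::printf("run grad of %s\n", cur->name);  // stands in for ApplyGrad()
    for (Op* pre : cur->pre_ops)
      if (--dep[pre] == 0) ready.push_back(pre);
  }
  return 0;  // prints d, b, c, then a
}
```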
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace imperative {
class OpBase;
class VarBase {
public:
VarBase()
: pre_op_(nullptr),
pre_op_out_idx_(-1),
var_desc_(nullptr),
var_(nullptr),
grads_(nullptr) {}
virtual ~VarBase() {}
void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
void RunBackward(framework::Scope* scope);
framework::LoDTensor& Grad();
OpBase* pre_op_;
int pre_op_out_idx_;
framework::VarDesc* var_desc_;
framework::Variable* var_;
framework::Variable* grads_;
};
class OpBase {
public:
OpBase()
: input_vars_(new std::vector<VarBase*>()),
output_vars_(new std::vector<VarBase*>()),
pre_ops_(new std::vector<OpBase*>()),
pre_ops_out_idx_(new std::vector<int>()),
op_desc_(nullptr),
grad_op_desc_(nullptr),
grad_to_var_(nullptr) {}
virtual ~OpBase() {
delete input_vars_;
delete output_vars_;
delete pre_ops_;
delete pre_ops_out_idx_;
if (grad_op_desc_) delete grad_op_desc_;
if (grad_to_var_) delete grad_to_var_;
}
std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
std::vector<VarBase*>* input_vars_;
std::vector<VarBase*>* output_vars_;
std::vector<OpBase*>* pre_ops_;
std::vector<int>* pre_ops_out_idx_;
framework::OpDesc* op_desc_;
framework::OpDesc* grad_op_desc_;
std::unordered_map<std::string, std::string>* grad_to_var_;
framework::BlockDesc* block_;
};
class Layer {
public:
virtual ~Layer() {}
virtual std::vector<VarBase> Forward(const std::vector<VarBase>& inputs) {
std::vector<VarBase> vars;
return vars;
}
virtual void Backward() { LOG(ERROR) << "Customized Backward is not supported yet"; }
};
} // namespace imperative
} // namespace paddle
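// A minimal sketch (not part of this patch) of how a user-defined layer is
// expected to plug into the interface above; `MyLayer` and its body are
// hypothetical and only illustrate overriding Forward().
//
//   class MyLayer : public paddle::imperative::Layer {
//    public:
//     std::vector<paddle::imperative::VarBase> Forward(
//         const std::vector<paddle::imperative::VarBase>& inputs) override {
//       // Run traced ops on `inputs` and return the produced VarBases.
//       return inputs;
//     }
//   };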
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/tracer.h"
namespace paddle {
namespace imperative {} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
namespace paddle {
namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
framework::OpDesc** grad_op_desc,
std::unordered_map<std::string, std::string>* grad_to_var) {
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only one grad op is supported now.");
// TODO(panyx0718): Leak?
*grad_op_desc = grad_op_descs[0].release();
}
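// For example, for a forward op of type "mul" the registered GradOpMaker
// typically emits a single "mul_grad" op here and fills grad_to_var with
// mappings such as {"X@GRAD" -> "X", "Y@GRAD" -> "Y", "Out@GRAD" -> "Out"},
// so gradient variables can later be matched back to their forward variables.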
class Tracer {
public:
explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
root_scope_ = new framework::Scope();
scopes_[root_block_] = root_scope_;
}
virtual ~Tracer() { delete root_scope_; }
void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
const std::vector<VarBase*>& outputs,
framework::BlockDesc* block) {
framework::Scope* scope = GetScope(block);
framework::OpDesc* op_desc = op->op_desc_;
VLOG(3) << "tracer tracing " << op_desc->Type();
op_desc->InferShape(*block);
op_desc->InferVarType(block);
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc);
*op->input_vars_ = inputs;
for (VarBase* input : inputs) {
const std::string vname = input->var_desc_->Name();
framework::Variable* var = scope->Var(vname);
input->var_ = var;
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block->FindVar(vname);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support this variable type yet";
}
}
if (input->pre_op_) {
op->pre_ops_->push_back(input->pre_op_);
op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
} else {
op->pre_ops_->push_back(nullptr);
}
}
*op->output_vars_ = outputs;
for (size_t i = 0; i < outputs.size(); ++i) {
const std::string vname = outputs[i]->var_desc_->Name();
framework::Variable* var = scope->Var(vname);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block->FindVar(vname);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support this variable type yet";
}
}
outputs[i]->var_ = var;
outputs[i]->pre_op_ = op;
outputs[i]->pre_op_out_idx_ = i;
}
op_base->Run(*scope, platform::CPUPlace());
framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
op->grad_op_desc_ = grad_op_desc;
op->grad_to_var_ = grad_to_var;
op->block_ = block;
}
framework::Scope* GetScope(framework::BlockDesc* block) {
if (scopes_.find(block) != scopes_.end()) {
return scopes_.at(block);
}
framework::BlockDesc* parent_block = block->ParentBlock();
PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end());
framework::Scope* scope = &scopes_[parent_block]->NewScope();
scopes_[block] = scope;
return scope;
}
private:
std::map<framework::BlockDesc*, framework::Scope*> scopes_;
framework::BlockDesc* root_block_;
framework::Scope* root_scope_;
};
} // namespace imperative
} // namespace paddle
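// A hypothetical usage sketch (not part of this patch): `x` and `y` stand for
// caller-constructed VarBase pointers, and `op` for an OpBase whose op_desc_
// has already been filled in.
//
//   framework::ProgramDesc program;
//   framework::BlockDesc* block = program.MutableBlock(0);
//   imperative::Tracer tracer(block);
//   tracer.Trace(op, {x}, {y}, block);        // runs forward, records grad op
//   y->RunBackward(tracer.GetScope(block));   // accumulates grads_ on inputs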
...@@ -103,6 +103,7 @@ struct Argument { ...@@ -103,6 +103,7 @@ struct Argument {
// Model specified with program and parameters files. // Model specified with program and parameters files.
DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
// The overall graph to work on. // The overall graph to work on.
DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
...@@ -115,6 +116,10 @@ struct Argument { ...@@ -115,6 +116,10 @@ struct Argument {
DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
std::vector<std::string>); std::vector<std::string>);
// Pass a set of op types to enable their MKL-DNN kernels
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
......
...@@ -63,6 +63,11 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -63,6 +63,11 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
pass_num++; pass_num++;
} }
if (pass_name == "mkldnn_placement_pass") {
pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>(
argument->mkldnn_enabled_op_types()));
}
if (pass_name == "tensorrt_subgraph_pass") { if (pass_name == "tensorrt_subgraph_pass") {
PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
......
...@@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { ...@@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
argument->model_params_path_valid()) { argument->model_params_path_valid()) {
auto program = auto program =
LoadModel(argument->model_program_path(), argument->model_params_path(), LoadModel(argument->model_program_path(), argument->model_params_path(),
argument->scope_ptr(), place); argument->scope_ptr(), place, argument->model_from_memory());
argument->SetMainProgram(program.release()); argument->SetMainProgram(program.release());
} else { } else {
PADDLE_THROW( PADDLE_THROW(
...@@ -68,9 +68,14 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel( ...@@ -68,9 +68,14 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel( std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
const std::string &program_path, const std::string &params_path, const std::string &program_path, const std::string &params_path,
framework::Scope *scope, const platform::Place &place) { framework::Scope *scope, const platform::Place &place,
bool model_from_memory) {
framework::Executor exe(place); framework::Executor exe(place);
return Load(&exe, scope, program_path, params_path); if (!model_from_memory) {
return Load(&exe, scope, program_path, params_path);
} else {
return LoadFromMemory(&exe, scope, program_path, params_path);
}
} }
std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; }
......
...@@ -24,7 +24,7 @@ namespace inference { ...@@ -24,7 +24,7 @@ namespace inference {
namespace analysis { namespace analysis {
/* /*
* Load program and parameter to memory from the disk. * Load the program and parameters into memory from disk or directly from memory.
*/ */
class IrGraphBuildPass : public AnalysisPass { class IrGraphBuildPass : public AnalysisPass {
public: public:
...@@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass { ...@@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass {
const platform::Place &place); const platform::Place &place);
std::unique_ptr<framework::ProgramDesc> LoadModel( std::unique_ptr<framework::ProgramDesc> LoadModel(
const std::string &program_path, const std::string &params_path, const std::string &program_path, const std::string &params_path,
framework::Scope *scope, const platform::Place &place); framework::Scope *scope, const platform::Place &place,
bool model_from_memory);
std::string model_binary_str_; std::string model_binary_str_;
}; };
......
...@@ -49,10 +49,15 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { ...@@ -49,10 +49,15 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this. // fields from this.
enable_ir_optim = other.enable_ir_optim; enable_ir_optim = other.enable_ir_optim;
// For mkldnn
use_mkldnn_ = other.use_mkldnn_;
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
use_feed_fetch_ops = other.use_feed_fetch_ops; use_feed_fetch_ops = other.use_feed_fetch_ops;
use_tensorrt_ = other.use_tensorrt_; use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
model_from_memory_ = other.model_from_memory_;
if (use_gpu) { if (use_gpu) {
pass_builder_.reset(new GpuPassStrategy( pass_builder_.reset(new GpuPassStrategy(
...@@ -76,10 +81,16 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { ...@@ -76,10 +81,16 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this. // fields from this.
enable_ir_optim = other.enable_ir_optim; enable_ir_optim = other.enable_ir_optim;
// For mkldnn
use_mkldnn_ = other.use_mkldnn_;
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
use_feed_fetch_ops = other.use_feed_fetch_ops; use_feed_fetch_ops = other.use_feed_fetch_ops;
use_tensorrt_ = other.use_tensorrt_; use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
model_from_memory_ = other.model_from_memory_;
pass_builder_ = std::move(other.pass_builder_); pass_builder_ = std::move(other.pass_builder_);
} }
...@@ -102,4 +113,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, ...@@ -102,4 +113,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
} }
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
size_t param_buffer_size) {
prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size);
param_file = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
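// A hedged usage sketch of the setter above: the caller owns the raw buffers,
// e.g. after reading the model files itself (paths and helper are as in the
// tests below; buffer names are hypothetical):
//
//   std::string prog, params;  // filled via ReadBinaryFile or similar
//   contrib::AnalysisConfig cfg;
//   cfg.SetModelBuffer(prog.data(), prog.size(), params.data(), params.size());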
} // namespace paddle } // namespace paddle
...@@ -308,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -308,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetUseGPU(config_.use_gpu); argument_.SetUseGPU(config_.use_gpu);
argument_.SetGPUDeviceId(config_.device); argument_.SetGPUDeviceId(config_.device);
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program // Analyze inference_program
if (!config_.model_dir.empty()) { if (!config_.model_dir.empty()) {
argument_.SetModelDir(config_.model_dir); argument_.SetModelDir(config_.model_dir);
...@@ -326,6 +327,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -326,6 +327,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
} }
if (config_.use_mkldnn_) {
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
}
auto passes = config_.pass_builder()->AllPasses(); auto passes = config_.pass_builder()->AllPasses();
if (!config_.enable_ir_optim) passes.clear(); if (!config_.enable_ir_optim) passes.clear();
argument_.SetIrAnalysisPasses(passes); argument_.SetIrAnalysisPasses(passes);
...@@ -448,20 +453,24 @@ bool AnalysisPredictor::LoadProgramDesc() { ...@@ -448,20 +453,24 @@ bool AnalysisPredictor::LoadProgramDesc() {
return false; return false;
} }
std::string pb_content;
// Read binary
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
fin.seekg(0, std::ios::end);
pb_content.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(pb_content.at(0)), pb_content.size());
fin.close();
// Create ProgramDesc // Create ProgramDesc
framework::proto::ProgramDesc proto; framework::proto::ProgramDesc proto;
proto.ParseFromString(pb_content); if (!config_.model_from_memory()) {
std::string pb_content;
// Read binary
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
filename);
fin.seekg(0, std::ios::end);
pb_content.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(pb_content.at(0)), pb_content.size());
fin.close();
proto.ParseFromString(pb_content);
} else {
proto.ParseFromString(config_.prog_file);
}
inference_program_.reset(new framework::ProgramDesc(proto)); inference_program_.reset(new framework::ProgramDesc(proto));
return true; return true;
} }
...@@ -469,6 +478,7 @@ bool AnalysisPredictor::LoadProgramDesc() { ...@@ -469,6 +478,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
bool AnalysisPredictor::LoadParameters() { bool AnalysisPredictor::LoadParameters() {
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
"The inference program should be loaded first."); "The inference program should be loaded first.");
const auto &global_block = inference_program_->MutableBlock(0); const auto &global_block = inference_program_->MutableBlock(0);
// create a temporary program to load parameters. // create a temporary program to load parameters.
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <cassert> #include <cassert>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_set>
#include <vector> #include <vector>
// Here we include some header files with relative paths, for that in deploy, // Here we include some header files with relative paths, for that in deploy,
...@@ -52,18 +53,26 @@ struct AnalysisConfig : public NativeConfig { ...@@ -52,18 +53,26 @@ struct AnalysisConfig : public NativeConfig {
bool use_tensorrt() const { return use_tensorrt_; } bool use_tensorrt() const { return use_tensorrt_; }
void EnableMKLDNN(); void EnableMKLDNN();
// NOTE this is just for internal development, please do not use it.
// NOT stable yet.
bool use_mkldnn() const { return use_mkldnn_; } bool use_mkldnn() const { return use_mkldnn_; }
void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
mkldnn_enabled_op_types_ = op_list;
}
// Specify the memory buffers of the program and the parameters.
void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
const char* param_buffer, size_t param_buffer_size);
bool model_from_memory() const { return model_from_memory_; }
friend class ::paddle::AnalysisPredictor; friend class ::paddle::AnalysisPredictor;
protected: protected:
bool use_tensorrt_{false}; bool use_tensorrt_{false};
bool use_mkldnn_{false}; bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
int tensorrt_workspace_size_; int tensorrt_workspace_size_;
int tensorrt_max_batchsize_; int tensorrt_max_batchsize_;
std::unique_ptr<PassStrategy> pass_builder_; std::unique_ptr<PassStrategy> pass_builder_;
bool model_from_memory_{false};
}; };
// Configurations for Anakin engine. // Configurations for Anakin engine.
......
...@@ -98,9 +98,10 @@ class CpuPassStrategy : public PassStrategy { ...@@ -98,9 +98,10 @@ class CpuPassStrategy : public PassStrategy {
passes_.insert(passes_.begin(), "mkldnn_placement_pass"); passes_.insert(passes_.begin(), "mkldnn_placement_pass");
for (auto &pass : for (auto &pass :
std::vector<std::string>({"depthwise_conv_mkldnn_pass", // std::vector<std::string>({"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", // "conv3d_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass"})) { "conv_elementwise_add_mkldnn_fuse_pass"})) {
passes_.push_back(pass); passes_.push_back(pass);
} }
......
...@@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) { ...@@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) {
void LoadPersistables(framework::Executor* executor, framework::Scope* scope, void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program, const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const std::string& param_filename) { const std::string& param_filename,
bool model_from_memory = false) {
const framework::BlockDesc& global_block = main_program.Block(0); const framework::BlockDesc& global_block = main_program.Block(0);
framework::ProgramDesc* load_program = new framework::ProgramDesc(); framework::ProgramDesc* load_program = new framework::ProgramDesc();
...@@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, ...@@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
op->SetType("load_combine"); op->SetType("load_combine");
op->SetOutput("Out", paramlist); op->SetOutput("Out", paramlist);
op->SetAttr("file_path", {param_filename}); op->SetAttr("file_path", {param_filename});
op->SetAttr("model_from_memory", {model_from_memory});
op->CheckAttrs(); op->CheckAttrs();
} }
...@@ -130,16 +132,17 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, ...@@ -130,16 +132,17 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
"model version %ld is not supported.", "model version %ld is not supported.",
main_program->Version()); main_program->Version());
LoadPersistables(executor, scope, *main_program, dirname, ""); // model_from_memory is false when parameters are stored in separate files.
LoadPersistables(executor, scope, *main_program, dirname, "",
false /* model_from_memory */);
return main_program; return main_program;
} }
std::unique_ptr<framework::ProgramDesc> Load( std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor* executor, framework::Scope* scope, framework::Executor* executor, framework::Scope* scope,
const std::string& prog_filename, const std::string& param_filename) { const std::string& prog_filename, const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str; std::string program_desc_str;
ReadBinaryFile(model_filename, &program_desc_str); ReadBinaryFile(prog_filename, &program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str)); new framework::ProgramDesc(program_desc_str));
...@@ -147,7 +150,22 @@ std::unique_ptr<framework::ProgramDesc> Load( ...@@ -147,7 +150,22 @@ std::unique_ptr<framework::ProgramDesc> Load(
"model version %ld is not supported.", "model version %ld is not supported.",
main_program->Version()); main_program->Version());
LoadPersistables(executor, scope, *main_program, "", param_filename); LoadPersistables(executor, scope, *main_program, "", param_filename,
false /* model_from_memory */);
return main_program;
}
std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_buffer, const std::string& param_buffer) {
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(prog_buffer));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, "", param_buffer,
true /* model_from_memory */);
return main_program; return main_program;
} }
......
...@@ -30,7 +30,8 @@ void Init(const std::vector<std::string> argv); ...@@ -30,7 +30,8 @@ void Init(const std::vector<std::string> argv);
void LoadPersistables(framework::Executor* executor, framework::Scope* scope, void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program, const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const std::string& param_filename); const std::string& param_filename,
bool model_from_memory);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope, framework::Scope* scope,
...@@ -41,6 +42,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, ...@@ -41,6 +42,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
const std::string& prog_filename, const std::string& prog_filename,
const std::string& param_filename); const std::string& param_filename);
std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_buffer, const std::string& param_buffer);
// Save the variables from a scope to disk. // Save the variables from a scope to disk.
void SaveVars(const framework::Scope& scope, void SaveVars(const framework::Scope& scope,
const std::vector<std::string>& vars, const std::string& dirname, const std::vector<std::string>& vars, const std::string& dirname,
......
...@@ -109,8 +109,12 @@ class Pool2dOpConverter : public OpConverter { ...@@ -109,8 +109,12 @@ class Pool2dOpConverter : public OpConverter {
} }
if (pool_type == "max") { if (pool_type == "max") {
nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]); // Under ceil mode, the pre_pad and post_pad are used to
nvinfer1::DimsHW post_pad(paddings[0], paddings[1]); // record the padding size. In some ceil mode cases,
// we do not need padding, so we initialize the two vars to 0.
nvinfer1::DimsHW pre_pad(0, 0);
nvinfer1::DimsHW post_pad(0, 0);
if (ceil_mode) { if (ceil_mode) {
// If ceil mode is true, we will pad the appropriate size to the input. // If ceil mode is true, we will pad the appropriate size to the input.
DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,
......
...@@ -188,10 +188,16 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -188,10 +188,16 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
} }
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_dam, profile) { void profile(bool use_mkldnn = false) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
std::unordered_set<std::string> op_list = {"conv3d"};
cfg.SetMKLDNNOp(op_list);
}
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
...@@ -209,6 +215,11 @@ TEST(Analyzer_dam, profile) { ...@@ -209,6 +215,11 @@ TEST(Analyzer_dam, profile) {
} }
} }
TEST(Analyzer_dam, profile) { profile(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); }
#endif
// Check the fuse status // Check the fuse status
TEST(Analyzer_dam, fuse_statis) { TEST(Analyzer_dam, fuse_statis) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
...@@ -222,9 +233,14 @@ TEST(Analyzer_dam, fuse_statis) { ...@@ -222,9 +233,14 @@ TEST(Analyzer_dam, fuse_statis) {
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_dam, compare) { void compare(bool use_mkldnn = false) {
contrib::AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
std::unordered_set<std::string> op_list = {"conv3d"};
cfg.SetMKLDNNOp(op_list);
}
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
...@@ -233,5 +249,10 @@ TEST(Analyzer_dam, compare) { ...@@ -233,5 +249,10 @@ TEST(Analyzer_dam, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all); reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_dam, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -93,9 +93,17 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -93,9 +93,17 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; if (memory_load) {
cfg->param_file = FLAGS_infer_model + "/param"; std::string buffer_prog, buffer_param;
ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog);
ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param);
cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
buffer_param.size());
} else {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
}
cfg->use_gpu = false; cfg->use_gpu = false;
cfg->device = 0; cfg->device = 0;
cfg->specify_input_name = true; cfg->specify_input_name = true;
...@@ -114,9 +122,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -114,9 +122,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
} }
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_Chinese_ner, profile) { void profile(bool memory_load = false) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg, memory_load);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) { ...@@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) {
} }
} }
TEST(Analyzer_Chinese_ner, profile) { profile(); }
TEST(Analyzer_Chinese_ner, profile_memory_load) {
profile(true /* memory_load */);
}
// Check the fuse status // Check the fuse status
TEST(Analyzer_Chinese_ner, fuse_statis) { TEST(Analyzer_Chinese_ner, fuse_statis) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
......
...@@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { ...@@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os << GenSpaces(num_spaces) << "device: " << config.device << "\n"; os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n"; << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "specify_input_name: " << config.specify_input_name << "\n"; << "specify_input_name: " << config.specify_input_name << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
...@@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os, ...@@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os,
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++; num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config); os << *reinterpret_cast<const NativeConfig *>(&config);
if (!config.model_from_memory()) {
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
} else {
os << GenSpaces(num_spaces)
<< "prog_file and param_file: load from memory \n";
}
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
<< "\n"; << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
......
cc_library(benchmark SRCS benchmark.cc DEPS enforce) cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_binary(visualizer SRCS visualizer.cc DEPS analysis
paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
if(WIN32)
target_link_libraries(visualizer shlwapi)
endif(WIN32)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/visualizer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <fstream>
#include <memory>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/platform/init.h"
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(model_program_path, "", "model program path");
DEFINE_string(model_params_path, "", "model params path");
USE_PASS(graph_viz_pass);
USE_PASS(graph_to_program_pass);
using paddle::inference::analysis::Argument;
namespace paddle {
namespace inference {
namespace utils {
void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
bool Visualizer::Run() {
paddle::framework::InitDevices(false);
paddle::inference::analysis::Analyzer().Run(argument_);
return true;
}
} // namespace utils
} // namespace inference
} // namespace paddle
// Generate a dot file describing the structure of graph.
// To use this tool, run command: ./visualizer [options...]
// Options:
// --model_dir: the directory of model
// --model_program_path: the path of program
// --model_params_path: the path of params
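// Example invocation (hypothetical paths):
//   ./visualizer --model_program_path=/path/to/__model__ \
//                --model_params_path=/path/to/param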
int main(int argc, char *argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
paddle::inference::analysis::Argument argument;
argument.SetUseGPU(false);
argument.SetUseTensorRT(false);
if (FLAGS_model_dir.empty()) {
if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
LOG(ERROR) << "Please set model_dir"
" or model_program_path and model_params_path";
return -1;
} else {
argument.SetModelProgramPath(FLAGS_model_program_path);
argument.SetModelParamsPath(FLAGS_model_params_path);
}
} else {
argument.SetModelDir(FLAGS_model_dir);
}
// Only 1 pass is used; the default output filename is 0_ir_origin.dot.
// For more details, see paddle::inference::analysis::IRPassManager.
argument.SetIrAnalysisPasses({"graph_viz_pass"});
std::unique_ptr<paddle::framework::Scope> scope{
new paddle::framework::Scope()};
argument.SetScopeNotOwned(
const_cast<paddle::framework::Scope *>(scope.get()));
paddle::inference::utils::Visualizer visualizer;
visualizer.SetArgument(&argument);
visualizer.Run();
return 0;
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/argument.h"
namespace paddle {
namespace inference {
namespace utils {
using paddle::inference::analysis::Argument;
class Visualizer final {
public:
Visualizer() = default;
~Visualizer() = default;
Visualizer(const Visualizer &) = delete;
Visualizer &operator=(const Visualizer &) = delete;
void SetArgument(Argument *);
bool Run();
private:
Argument *argument_;
};
} // namespace utils
} // namespace inference
} // namespace paddle
...@@ -100,8 +100,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -100,8 +100,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
const T *x_data = x->data<T>(); const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace()); T *y_data = y->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4, PADDLE_ENFORCE(
"Input dim must be with 2 or 4"); x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4,
"Input dim must be with 2, 3 or 4");
std::vector<int> src_tz = framework::vectorize2int(x->dims()); std::vector<int> src_tz = framework::vectorize2int(x->dims());
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bpr_loss_op.h"
namespace paddle {
namespace operators {
class BprLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
int rank = x_dims.size();
PADDLE_ENFORCE_EQ(rank, label_dims.size(),
"Input(X) and Input(Label) shall have the same rank.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(label_dims, 0, rank - 1),
"Input(X) and Input(Label) shall have the same shape "
"except the last dimension.");
auto y_dims = x_dims;
y_dims[rank - 1] = 1;
ctx->SetOutputDim("Y", y_dims);
ctx->ShareLoD("X", /*->*/ "Y");
}
protected:
// Explicitly set that the data type of the computation kernel of Seq-bpr
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
class BprLossGradientOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) shoudl be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
int rank = x_dims.size();
PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
"Input(Y@Grad) and Input(X) should have the same rank.");
PADDLE_ENFORCE_EQ(label_dims.size(), rank,
"Input(Label) and Input(X) should have the same rank.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(label_dims, 0, rank - 1),
"The Input(X) and Input(Label) should have the same "
"shape except the last dimension.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(dy_dims, 0, rank - 1),
"The Input(X) and Input(Y@Grad) should have the same "
"shape except the last dimension.");
PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
"The last dimension of Input(Y@Grad) should be 1.");
PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
" the last dimension of Input(Label) should be 1.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
// Explicitly set that the data type of the computation kernel of bpr_loss
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
class BprLossOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a tensor whose last dimension "
"size is equal to the number of classes. This input is a "
"real number.");
AddInput(
"Label",
"(Tensor), the tensor which represents the ground truth. It has the "
"same shape with 'X' except the last dimension. the last dimension "
"size is 1.");
AddOutput("Y",
"(Tensor, default Tensor<float>), a tensor whose shape is same "
"with 'X' except that the last dimension size is 1. It "
"represents the sequence bpr loss.");
AddComment(R"DOC(
Bayesian Personalized Ranking Loss Operator.
This operator computes a pairwise ranking loss, in which Label is the desired item.
The loss at a given point in one session is defined as:
$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$
Learn more by reading the paper "Session-based Recommendations with Recurrent
Neural Networks" (https://arxiv.org/abs/1511.06939).
)DOC");
}
};
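// Worked example (illustrative numbers only): with one sample
// X[i] = {2.0, 0.5, 1.0}, Label[i] = 0 and class_num = 3, the CPU kernel in
// bpr_loss_op.h computes
//   Y[i] = -(log(sigmoid(2.0 - 0.5)) + log(sigmoid(2.0 - 1.0))) / 2
//        = -((-0.2014) + (-0.3133)) / 2 ≈ 0.2573,
// i.e. the mean pairwise log-sigmoid margin against the two negative classes.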
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>,
ops::BprLossOpKernel<CPUCtx, double>);
REGISTER_OP_CPU_KERNEL(bpr_loss_grad,
ops::BprLossGradientOpKernel<CPUCtx, float>,
ops::BprLossGradientOpKernel<CPUCtx, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
/* TODO: Find a way to adapt TolerableValue using BLAS or Eigen. */
template <typename T>
struct TolerableValue {
HOSTDEVICE T operator()(const T& x) const {
PADDLE_ASSERT(std::is_floating_point<T>::value);
const T kApproInf = 1e20;
if (x == INFINITY) return kApproInf;
if (x == -INFINITY) return -kApproInf;
return x;
}
};
template <typename DeviceContext, typename T>
class BprLossOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* label = ctx.Input<Tensor>("Label");
auto* y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
int rank = x->dims().size();
Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1);
Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
const framework::Tensor* logits = &x_2d;
const framework::Tensor* labels = &labels_2d;
framework::Tensor* out = &y_2d;
const int step_size = logits->dims()[0];
const int class_num = logits->dims()[1];
const T* logits_data = logits->data<T>();
T* loss_data = out->data<T>();
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < step_size; ++i) {
int lbl_pos = label_data[i];
PADDLE_ENFORCE_GE(lbl_pos, 0);
PADDLE_ENFORCE_LT(lbl_pos, class_num);
int index_pos = i * class_num + lbl_pos;
T sum = static_cast<T>(0);
for (int j = 0; j < class_num; j++) {
if (j == lbl_pos) continue;
int index_neg = i * class_num + j;
sum += TolerableValue<T>()(-std::log(
1.0f + TolerableValue<T>()(std::exp(logits_data[index_neg] -
logits_data[index_pos]))));
}
loss_data[i] = -sum / (class_num - 1);
}
}
};
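// For reference, the gradient implemented below follows from
//   Y[i] = -1/(N-1) * sum_{j != p} log(sigmoid(x_p - x_j)),  with p = Label[i],
// so that
//   dY/dx_p = -1/(N-1) * sum_{j != p} 1/(1 + exp(x_p - x_j)),
// and each negative class j receives the opposite contribution; `grad_` below
// is one summand of dY/dx_p scaled by the upstream gradient dy.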
template <typename DeviceContext, typename T>
class BprLossGradientOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto* label = ctx.Input<Tensor>("Label");
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
const int step_size = x->dims()[0];
const int num_classes = x->dims()[1];
T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
const T* dy_data = dy->data<T>();
const T* x_data = x->data<T>();
const int64_t* label_data = label->data<int64_t>();
for (size_t sample_id = 0; sample_id < step_size; sample_id++) {
for (size_t x_offset = sample_id * num_classes;
x_offset < (sample_id + 1) * num_classes; x_offset++) {
dx_data[x_offset] = static_cast<T>(0);
}
auto p_index = sample_id * num_classes + label_data[sample_id];
for (size_t ni = 0; ni < num_classes; ni++) {
if (label_data[sample_id] == ni) continue;
auto n_index = sample_id * num_classes + ni;
auto grad_ = -dy_data[sample_id] /
((num_classes - 1) *
(1.0f + TolerableValue<T>()(std::exp(x_data[p_index] -
x_data[n_index]))));
dx_data[p_index] += grad_;
dx_data[n_index] -= grad_;
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(input->dims()); auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
if (activation == "identity") { if (!exhaustive_search) {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (!exhaustive_search) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
...@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit"); "workspace_size to be allocated exceeds the limit");
// ------------------- cudnn conv+bias+act forward -------------------- if ((activation == "identity") &&
ScalingParamType<T> alpha1 = 1.0f; (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f; (!residual)) {
auto cudnn_func = [&](void* cudnn_workspace) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, // But tests show that in some cases it is slower, so use
filter_data, cudnn_conv_desc, algo, cudnn_workspace, // cudnnConvolutionForward and cudnnAddTensor instead.
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, // ------------- cudnn conv forward and bias add ---------------------
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data)); output_data));
}; } else {
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); if (activation == "identity") {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
} }
}; };
#endif #endif
......
...@@ -28,6 +28,46 @@ using mkldnn::stream; ...@@ -28,6 +28,46 @@ using mkldnn::stream;
using platform::to_void_cast; using platform::to_void_cast;
using platform::GetMKLDNNFormat; using platform::GetMKLDNNFormat;
inline void GetWeightsTz(std::vector<int>& weights_tz, int groups, // NOLINT
bool is_conv3d) {
if (groups > 1) {
if (is_conv3d) {
int output = weights_tz[0];
int input = weights_tz[1];
int dimension = weights_tz[2];
int height = weights_tz[3];
int width = weights_tz[4];
weights_tz.resize(6);
weights_tz[0] = groups;
weights_tz[1] = output / groups;
weights_tz[2] = input;
weights_tz[3] = dimension;
weights_tz[4] = height;
weights_tz[5] = width;
} else {
int output = weights_tz[0];
int input = weights_tz[1];
int height = weights_tz[2];
int width = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = groups;
weights_tz[1] = output / groups;
weights_tz[2] = input;
weights_tz[3] = height;
weights_tz[4] = width;
}
}
}
inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
int groups, bool is_conv3d) {
if (is_conv3d) {
return (groups == 1) ? format : mkldnn::memory::format::goidhw;
} else {
return (groups == 1) ? format : mkldnn::memory::format::goihw;
}
}
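// Illustrative example of the two helpers above (dims are hypothetical):
// with groups = 2 and a 2-D conv filter of dims OIHW = [8, 4, 3, 3],
// GetWeightsTz rewrites the dims to GOIHW = [2, 4, 4, 3, 3] and
// GetWeightsFormat returns mkldnn::memory::format::goihw; the 3-D case adds
// the depth dimension analogously (OIDHW -> GOIDHW, format goidhw).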
template <typename T> template <typename T>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public: public:
...@@ -52,10 +92,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -52,10 +92,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef, filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor"); "Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->dims().size() == 4, PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 dimensions, i.e. NCHW"); "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4, PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
"Filter must be with 4 dimensions, i.e. OIHW"); "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
if (bias) { if (bias) {
PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
bias->format() != memory::format::format_undef, bias->format() != memory::format::format_undef,
...@@ -71,9 +111,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -71,9 +111,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection"); bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
bool is_conv3d = strides.size() == 3U;
// TODO(tpatejko): add support for dilation // TODO(tpatejko): add support for dilation
PADDLE_ENFORCE( PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, is_conv3d
? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
dilations[2] == 1
+            : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
         "dilation in convolution is not implemented yet");

     const T* input_data = input->data<T>();
@@ -83,18 +127,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
     int g = std::max(groups, 1);
-    if (g > 1) {
-      int o = weights_tz[0];
-      int i = weights_tz[1];
-      int h = weights_tz[2];
-      int w = weights_tz[3];
-      weights_tz.resize(5);
-      weights_tz[0] = g;
-      weights_tz[1] = o / g;
-      weights_tz[2] = i;
-      weights_tz[3] = h;
-      weights_tz[4] = w;
-    }
+    GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

     // Get unique name for storing MKLDNN primitives
@@ -105,11 +138,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<primitive> pipeline;

+    auto src_format = input->format();
+    mkldnn::memory::format weights_format =
+        GetWeightsFormat(filter->format(), g, is_conv3d);
+
     auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(),
-        (g == 1) ? filter->format() : mkldnn::memory::format::goihw);
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);

     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
@@ -119,10 +155,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);

+    if (is_conv3d) {
+      chosen_memory_format =
+          platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    }
+    weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
+
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
                                // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
@@ -263,8 +305,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       const mkldnn::engine& engine, const bool fuse_relu,
       const bool fuse_residual_conn,
       mkldnn::prop_kind fwd_prop_kind) const {
-    memory::dims stride_dims = {strides[0], strides[1]};
-    memory::dims padding_dims = {paddings[0], paddings[1]};
+    memory::dims stride_dims = strides;
+    memory::dims padding_dims = paddings;

     auto conv_desc = mkldnn::convolution_forward::desc(
         fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
@@ -288,8 +330,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       const mkldnn::engine& engine, const bool fuse_relu,
       const bool fuse_residual_conn,
       mkldnn::prop_kind fwd_prop_kind) const {
-    memory::dims stride_dims = {strides[0], strides[1]};
-    memory::dims padding_dims = {paddings[0], paddings[1]};
+    memory::dims stride_dims = strides;
+    memory::dims padding_dims = paddings;

     auto conv_desc = mkldnn::convolution_forward::desc(
         fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
@@ -349,6 +391,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");

+    bool is_conv3d = strides.size() == 3U;
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
     const T* output_grad_data = output_grad->data<T>();
@@ -358,8 +401,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
+
+    int g = std::max(groups, 1);
+    GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

+    auto src_format = input->format();
+    mkldnn::memory::format weights_format =
+        GetWeightsFormat(filter->format(), g, is_conv3d);
+
     // Get an unique name from "argument" name of "Output" variable
     // as well as attributes of primitive to be created
     // This name will be used as key when saving info into device context
@@ -372,9 +421,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     // Create user memory descriptors
     auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
     auto user_diff_dst_md = platform::MKLDNNMemDesc(
         {dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
@@ -386,14 +435,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);

+    if (is_conv3d) {
+      chosen_memory_format =
+          platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    }
+    weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
+
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto diff_src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     auto diff_weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     auto diff_dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
@@ -500,3 +555,13 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
                                     ::paddle::platform::CPUPlace, FP32,
                                     ops::kConvMKLDNNFP32,
                                     ops::ConvMKLDNNGradOpKernel<float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNOpKernel<float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNGradOpKernel<float>);
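For readers following the hunk above: `GetWeightsTz` and `GetWeightsFormat` are small helpers added elsewhere in this file and not shown in the diff. A minimal sketch of `GetWeightsTz`, reconstructed from the inline code it replaces — only the name and signature appear in the hunk, the body below is our assumption:

#include <vector>

// Hypothetical reconstruction of GetWeightsTz: for grouped convolution the
// weight shape OIHW (or OIDHW for conv3d) is regrouped to GOIHW (GOIDHW) by
// splitting the output-channel dimension O into G x O/G.
static void GetWeightsTz(std::vector<int>& weights_tz, int g, bool is_conv3d) {
  (void)is_conv3d;  // the same regrouping covers 4-D and 5-D weight dims
  if (g <= 1) return;  // ungrouped weights keep their original shape
  weights_tz[0] /= g;                        // O -> O/g
  weights_tz.insert(weights_tz.begin(), g);  // prepend G
}

// Example: {32, 8, 3, 3} (OIHW) with g = 4 becomes {4, 8, 8, 3, 3} (GOIHW),
// matching the 2-D branch that the hunk above removes.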
@@ -134,14 +134,14 @@ void Conv2DOpMaker::Make() {
            "The format of output tensor is X (one-dimensional) of size equal"
            "to the number of output channels. Only used with MKL-DNN.")
       .AsDispensable();
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution operator. "
-            "The format of output tensor is also NCHW.");
   AddInput("ResidualData",
            "(Tensor) Tensor with residual data "
            "to which convolution output will be added."
            "Used with fuse_residual_connection fusion.")
       .AsDispensable();
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator. "
+            "The format of output tensor is also NCHW.");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int> default:{1, 1}), the "
                             "strides(h_stride, w_stride) of "
@@ -232,6 +232,10 @@ $$
 }

 void Conv3DOpMaker::Make() {
+  AddAttr<bool>("is_test",
+                "(bool, default false) Set to true for inference only, false "
+                "for training. Some layers may run faster when this is true.")
+      .SetDefault(false);
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution operator. "
@@ -247,6 +251,11 @@ void Conv3DOpMaker::Make() {
       "is the width of the filter."
       "If the groups attribute is greater than 1, C equals the number of "
       "input image channels divided by the groups.");
+  AddInput("ResidualData",
+           "(Tensor) Tensor with residual data "
+           "to which convolution output will be added."
+           "Used with fuse_residual_connection fusion.")
+      .AsDispensable();
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
             "The format of output tensor is also NCDHW.");
@@ -280,6 +289,13 @@ void Conv3DOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddAttr<bool>("fuse_residual_connection",
+                "(bool, default false) Only used in mkldnn kernel. Used "
+                "whenever convolution output is used as an input to a "
+                "residual connection.")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
...
@@ -13,16 +13,26 @@ set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor
if(WITH_GRPC)
  grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
+      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc
      PROTO send_recv.proto
-      DEPS lod_tensor selected_rows memory)
+      DEPS lod_tensor selected_rows_functor memory)
  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  cc_test(grpc_serde_test SRCS grpc_serde_test.cc
    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
  cc_test(rpc_server_test SRCS rpc_server_test.cc
    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
  cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
+  if(WITH_GPU)
+    cc_test(collective_server_test SRCS collective_server_test.cc
+      DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+      selected_rows_functor scope math_function SERIAL)
+  endif()
  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
else()
  set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <condition_variable> // NOLINT
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/operators/distributed/collective_client.h"
DECLARE_int32(rpc_deadline);
namespace paddle {
namespace operators {
namespace distributed {
std::once_flag CollectiveClient::init_flag_;
std::unique_ptr<CollectiveClient> CollectiveClient::client_(nullptr);
bool CollectiveClient::Gather(const std::vector<RemoteVar>& remote_vars,
std::vector<const framework::SelectedRows*>* dst,
const platform::DeviceContext& ctx,
framework::Scope* scope, int64_t time_out) {
for (auto r : remote_vars) {
VLOG(50) << "begin gather from ep:" << r.String();
scope->Var(r.var_name_)->GetMutable<framework::SelectedRows>();
VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable(
r.ep_, ctx, *scope, r.var_name_, time_out);
}
rpc_client_->Wait();
for (auto r : remote_vars) {
auto select_rows =
scope->FindVar(r.var_name_)->GetMutable<framework::SelectedRows>();
dst->push_back(select_rows);
VLOG(4) << "gather from ep:" << r.String()
<< ", select_rows:" << GetSelectedRowsInfo(*select_rows);
rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_);
}
rpc_client_->Wait();
return true;
}
} // namespace distributed
} // namespace operators
} // namespace paddle
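As a reading aid, here is a minimal caller sketch for the `Gather` defined above, mirroring the test file further down; the endpoint and variable name are illustrative, and it assumes a collective server has already registered the variable:

#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/collective_client.h"

// Gathers "var1" from one remote endpoint; Gather blocks until every remote
// SelectedRows has arrived and keeps the rank order of remote_vars in dst.
void GatherFromPeers(const paddle::platform::DeviceContext& dev_ctx) {
  namespace dist = paddle::operators::distributed;
  dist::RemoteVar r;
  r.ep_ = "127.0.0.1:7164";  // illustrative endpoint
  r.var_name_ = "var1";      // must match the name registered on the server
  r.trainer_id_ = 0;

  paddle::framework::Scope scope;
  std::vector<const paddle::framework::SelectedRows*> dst;
  dist::CollectiveClient::GetInstance()->Gather({r}, &dst, dev_ctx, &scope);
}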
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <condition_variable> // NOLINT
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32(rpc_deadline);
namespace paddle {
namespace operators {
namespace distributed {
inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) {
std::stringstream ss;
ss << ", height:" << slr.height() << ", rows:[";
for (unsigned int i = 0; i < slr.rows().size(); i++) {
if (i != slr.rows().size() - 1) {
ss << slr.rows()[i] << ",";
} else {
ss << slr.rows()[i];
}
}
ss << "], dims:" << slr.value().dims();
return ss.str();
}
struct RemoteVar {
std::string ep_;
std::string var_name_;
int trainer_id_{0};
std::string String() {
std::stringstream ss;
ss << "ep:" << ep_ << ", var_name:" << var_name_
<< ", trainer_id:" << trainer_id_;
return ss.str();
}
};
class CollectiveClient {
public:
CollectiveClient() {
rpc_client_.reset(new RPCCLIENT_T());
rpc_client_->InitImpl();
}
virtual ~CollectiveClient() {}
// Note: this function retains the rank order of remote_vars in dst.
bool Gather(const std::vector<RemoteVar>& remote_vars,
std::vector<const framework::SelectedRows*>* dst,
const platform::DeviceContext& ctx, framework::Scope* scope,
int64_t time_out = FLAGS_rpc_deadline);
static CollectiveClient* GetInstance() {
std::call_once(init_flag_, [&]() {
if (client_.get() == nullptr) {
client_.reset(new CollectiveClient());
}
});
return client_.get();
}
private:
std::unique_ptr<RPCClient> rpc_client_;
static std::once_flag init_flag_;
static std::unique_ptr<CollectiveClient> client_;
};
} // namespace distributed
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h> // for removing the port file
#include <csignal>
#include <cstdlib>
#include <fstream>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/operators/distributed/collective_server.h"
DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get");
namespace paddle {
namespace operators {
namespace distributed {
std::once_flag CollectiveServer::init_flag_;
std::shared_ptr<CollectiveServer> CollectiveServer::collective_server_(nullptr);
CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) {
VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in;
rpc_server_.reset(new RPCSERVER_T(end_point, fan_in));
}
void CollectiveServer::Stop() {
rpc_server_->ShutDown();
server_thread_->join();
loop_thread_->join();
}
void CollectiveServer::StartServer() {
get_monomer_handler_.reset(new GetMonomerHandler());
get_monomer_handler_->SetRPCServer(rpc_server_.get());
get_barrier_handler_.reset(new GetMonomerBarrierHandler());
get_barrier_handler_->SetRPCServer(rpc_server_.get());
rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable,
get_monomer_handler_.get(),
FLAGS_collective_get_thread_num);
rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier,
get_barrier_handler_.get(), 1);
server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); }));
rpc_server_->WaitServerReady();
loop_thread_.reset(new std::thread([&]() {
while (true) {
if (rpc_server_->IsExit()) {
LOG(WARNING) << "get exit!rpc_processor break!";
break;
}
sleep(1);
}
VLOG(1) << "CollectiveServer loop_thread end";
}));
}
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <set>
#include <string>
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle {
namespace operators {
namespace distributed {
class CollectiveServer;
class GetMonomerHandler final : public RequestHandler {
public:
GetMonomerHandler() : RequestHandler(true) {}
virtual ~GetMonomerHandler() {}
bool Handle(const std::string& var_name, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override {
VLOG(50) << "GetMonomerHandler recv " << var_name;
*outvar = scope->FindVar(var_name);
PADDLE_ENFORCE(*outvar != nullptr, "%s not found", var_name);
return true;
}
};
class GetMonomerBarrierHandler final : public RequestHandler {
public:
GetMonomerBarrierHandler() : RequestHandler(true) {}
virtual ~GetMonomerBarrierHandler() {}
bool Handle(const std::string& var_name, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override {
VLOG(50) << "GetMonomerHandler recv " << var_name;
rpc_server_->IncreaseVarBarrier(var_name);
return true;
}
};
class CollectiveServer final {
public:
explicit CollectiveServer(const std::string& end_point, int fan_in);
virtual ~CollectiveServer() {}
void StartServer();
static CollectiveServer* GetInstance(const std::string& end_point,
int fan_in) {
std::call_once(init_flag_, [&]() {
if (collective_server_.get() == nullptr) {
collective_server_.reset(new CollectiveServer(end_point, fan_in));
collective_server_->StartServer();
}
});
return collective_server_.get();
}
std::shared_ptr<RPCServer> GetRPCServer() { return rpc_server_; }
void Stop();
private:
std::unique_ptr<GetMonomerHandler> get_monomer_handler_;
std::unique_ptr<GetMonomerBarrierHandler> get_barrier_handler_;
std::shared_ptr<distributed::RPCServer> rpc_server_;
std::shared_ptr<std::thread> server_thread_;
std::shared_ptr<std::thread> loop_thread_;
bool ready_{false};
static std::once_flag init_flag_;
static std::shared_ptr<CollectiveServer> collective_server_;
};
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/collective_client.h"
#include "paddle/fluid/operators/distributed/collective_server.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace distributed = paddle::operators::distributed;
std::unique_ptr<distributed::CollectiveServer> StartServer(
const std::string& ep, int fan_in, framework::Scope* scope,
platform::DeviceContext* dev_ctx) {
distributed::CollectiveServer* server =
distributed::CollectiveServer::GetInstance(ep, fan_in);
auto rpc_server = server->GetRPCServer();
rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable,
scope, dev_ctx);
std::cout << "StartServer return" << std::endl;
return std::unique_ptr<distributed::CollectiveServer>(server);
}
std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
framework::Scope* scope = new framework::Scope();
framework::Variable* var = scope->Var("var1");
auto* slr = var->GetMutable<framework::SelectedRows>();
slr->set_height(1000);
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({3, 5}));
tensor->mutable_data<float>(place);
paddle::operators::math::set_constant(ctx, tensor, 32.7);
for (int i = 0; i < 3; ++i) rows->push_back(i);
std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr);
return std::unique_ptr<framework::Scope>(scope);
}
void Gather(const std::vector<distributed::RemoteVar>& vars,
platform::DeviceContext* dev_ctx) {
distributed::CollectiveClient* client =
distributed::CollectiveClient::GetInstance();
framework::Scope* scope = new framework::Scope();
framework::Variable* var = scope->Var("var1");
var->GetMutable<framework::SelectedRows>();
std::vector<const framework::SelectedRows*> dst;
client->Gather(vars, &dst, *dev_ctx, scope);
std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]);
}
TEST(PREFETCH, GPU) {
platform::CUDAPlace place;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
std::string ep = "127.0.0.1:7164";
auto scope = GenerateVars(place);
auto* v1 = scope->FindVar("var1");
std::cout << "var1:" << v1 << std::endl;
auto server = StartServer(ep, 2, scope.get(), &ctx);
auto rpc_server = server->GetRPCServer();
distributed::RemoteVar var;
var.ep_ = ep;
var.var_name_ = "var1";
var.trainer_id_ = 0;
std::vector<distributed::RemoteVar> vars{var};
Gather(vars, &ctx);
Gather(vars, &ctx);
std::cout << "begin WaitVarBarrier" << std::endl;
rpc_server->WaitVarBarrier("var1");
rpc_server->ClearRegisteredVars();
server->Stop();
scope.release();
server.release();
}
@@ -28,11 +28,11 @@ namespace paddle {
 namespace operators {
 namespace distributed {

-void GRPCClient::InitImpl() { InitEventLoop(); }
-
-void GRPCClient::InitEventLoop() {
+void GRPCClient::InitImpl() {
   // start the client process thread
   // TODO(wuyi): can make this in a threadpool
+  PADDLE_ENFORCE(client_thread_ == nullptr,
+                 "please do not re-initialize the Proceed thread");
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }

@@ -106,6 +106,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,

 void ProcGetResponse(const VarHandle& var_h,
                      const ::grpc::ByteBuffer& ret_msg) {
+  VLOG(100) << "ProcGetResponse";
   framework::Variable* outvar = nullptr;
   // get response's trainer_id is not used
   int trainer_id;
@@ -126,6 +127,24 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                      const framework::Scope& scope,
                                      const std::string& var_name,
                                      int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name,
+                      "/sendrecv.SendRecvService/GetVariable", time_out);
+}
+
+VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& var_name,
+    int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name,
+                      "/sendrecv.SendRecvService/GetMonomerVariable", time_out);
+}
+
+VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& var_name,
+                                      const std::string& rpc_path,
+                                      int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -136,7 +155,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
   VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);

-  framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] {
+  framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -151,8 +170,8 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,

     platform::RecordRPCEvent record_event(method, p_ctx);

-    auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
+    auto call =
+        s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -268,6 +287,34 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   return h;
 }

+VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
+                                                const std::string& var_name,
+                                                int64_t time_out) {
+  const auto ch = GetChannel(ep);
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  const std::string method = "SendMonomerFetchBarrierRPC";
+  VarHandlePtr h(
+      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
+  s->Prepare(h, time_out);
+
+  VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
+
+  sendrecv::VariableMessage req;
+  req.set_varname(var_name);
+
+  platform::RecordRPCEvent record_event(method, nullptr);
+
+  auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
+  return h;
+}
+
 VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
                                            int64_t time_out) {
   const auto ch = GetChannel(ep);
...
@@ -189,6 +189,11 @@ class GRPCClient : public RPCClient {
                           const std::string& var_name,
                           int64_t time_out = FLAGS_rpc_deadline) override;

+  VarHandlePtr AsyncGetMonomerVariable(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) override;
+
   VarHandlePtr AsyncPrefetchVar(const std::string& ep,
                                 const platform::DeviceContext& ctx,
                                 const framework::Scope& scope,
@@ -200,8 +205,12 @@ class GRPCClient : public RPCClient {
   VarHandlePtr AsyncSendBatchBarrier(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;

-  VarHandlePtr AsyncSendFetchBarrier(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncSendFetchBarrier(const std::string& ep,
+                                     int64_t time_out) override;
+
+  VarHandlePtr AsyncGetMonomerBarrier(
+      const std::string& ep, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) override;

   VarHandlePtr AsyncCheckpointNotify(
       const std::string& ep, const std::string& dir,
@@ -214,21 +223,22 @@ class GRPCClient : public RPCClient {
   void SendComplete() override;

+ protected:
   void InitImpl() override;

 private:
-  // InitEventLoop should only be called by Init()
-  void InitEventLoop();
-
   void Proceed();

   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
+  VarHandlePtr _AsyncGetVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name, const std::string& rpc,
+                            int64_t time_out);

 private:
   grpc::CompletionQueue cq_;
   std::unordered_map<std::string, std::shared_ptr<grpc::Channel>> channels_;
-  std::unique_ptr<std::thread> client_thread_;
+  std::unique_ptr<std::thread> client_thread_{nullptr};

   // mutex for Wait client sync
   std::mutex sync_mutex_;
...
@@ -158,6 +158,98 @@ class RequestGet final : public RequestBase {
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
 };

+class RequestGetMonomerVariable final : public RequestBase {
+ public:
+  explicit RequestGetMonomerVariable(GrpcService::AsyncService* service,
+                                     ::grpc::ServerCompletionQueue* cq,
+                                     RequestHandler* request_handler,
+                                     int req_id, RPCServer* rpc_server)
+      : RequestBase(service, cq, request_handler, req_id),
+        responder_(&ctx_),
+        rpc_server_(rpc_server) {
+    auto method_id =
+        static_cast<int>(distributed::GrpcMethod::kGetMonomerVariable);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGetMonomerVariable() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
+    std::string varname = request_.varname();
+
+    rpc_server_->WaitVarCond(varname);
+    MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    auto scope = h.scope_;
+    auto invar = scope->FindVar(varname);
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar,
+                             request_.trainer_id());
+
+    if (outvar) {
+      SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_);
+    }
+
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ::grpc::ByteBuffer reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+  RPCServer* rpc_server_{nullptr};
+};
+
+class RequestGetMonomerBarrier final : public RequestBase {
+ public:
+  explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service,
+                                    ::grpc::ServerCompletionQueue* cq,
+                                    RequestHandler* request_handler, int req_id,
+                                    RPCServer* rpc_server)
+      : RequestBase(service, cq, request_handler, req_id),
+        responder_(&ctx_),
+        rpc_server_(rpc_server) {
+    auto method_id =
+        static_cast<int>(distributed::GrpcMethod::kGetMonomerBarrier);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGetMonomerBarrier() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
+    std::string varname = request_.varname();
+    VLOG(4) << "RequestGetMonomerBarrier " << varname;
+
+    rpc_server_->WaitVarCond(varname);
+    MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    framework::Scope* scope = nullptr;
+    framework::Variable* invar = nullptr;
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar,
+                             request_.trainer_id());
+
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  sendrecv::VoidMessage reply_;
+  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
+  RPCServer* rpc_server_{nullptr};
+};
+
 class RequestPrefetch final : public RequestBase {
  public:
   explicit RequestPrefetch(GrpcService::AsyncService* service,
@@ -249,7 +341,7 @@ class RequestCheckpointNotify final : public RequestBase {
 };

 void AsyncGRPCServer::WaitServerReady() {
-  VLOG(4) << "AsyncGRPCServer is wait server ready";
+  VLOG(4) << "AsyncGRPCServer is waiting server ready";
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
   condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
   VLOG(4) << "AsyncGRPCServer WaitServerReady";
@@ -368,6 +460,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
     b = new RequestSend(&service_, cq.get(), handler, req_id);
   } else if (rpc_name == kRequestGet) {
     b = new RequestGet(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestGetMonomerVariable) {
+    b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id,
+                                      this);
+  } else if (rpc_name == kRequestGetMonomerBarrier) {
+    b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id,
+                                     this);
   } else if (rpc_name == kRequestPrefetch) {
     b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
   } else if (rpc_name == kRequestCheckpoint) {
@@ -378,7 +476,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,

   reqs[req_id] = b;

-  VLOG(4) << "Create RequestSend status:" << b->Status();
+  VLOG(4) << "TryToRegisterNewOne status:" << b->Status();
 }

 void AsyncGRPCServer::HandleRequest(
...
@@ -81,10 +81,12 @@ enum class GrpcMethod {
   kGetVariable,
   kPrefetchVariable,
   kCheckpointNotify,
+  kGetMonomerVariable,
+  kGetMonomerBarrier,
 };

 static const int kGrpcNumMethods =
-    static_cast<int>(GrpcMethod::kCheckpointNotify) + 1;
+    static_cast<int>(GrpcMethod::kGetMonomerBarrier) + 1;

 inline const char* GrpcMethodName(GrpcMethod id) {
   switch (id) {
@@ -92,6 +94,10 @@ inline const char* GrpcMethodName(GrpcMethod id) {
       return "/sendrecv.SendRecvService/SendVariable";
     case GrpcMethod::kGetVariable:
       return "/sendrecv.SendRecvService/GetVariable";
+    case GrpcMethod::kGetMonomerVariable:
+      return "/sendrecv.SendRecvService/GetMonomerVariable";
+    case GrpcMethod::kGetMonomerBarrier:
+      return "/sendrecv.SendRecvService/GetMonomerBarrier";
     case GrpcMethod::kPrefetchVariable:
       return "/sendrecv.SendRecvService/PrefetchVariable";
     case GrpcMethod::kCheckpointNotify:
...
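The sentinel `kGrpcNumMethods` above is pinned to the last enumerator, so the two new methods had to be appended and the sentinel bumped together. A standalone restatement of that invariant — the static_assert is a hypothetical guard, not code from this patch:

enum class GrpcMethod {
  kSendVariable,
  kGetVariable,
  kPrefetchVariable,
  kCheckpointNotify,
  kGetMonomerVariable,
  kGetMonomerBarrier,  // new methods must stay last, before the count is taken
};

static const int kGrpcNumMethods =
    static_cast<int>(GrpcMethod::kGetMonomerBarrier) + 1;

// Compile-time check that the count tracks the enum; forgetting to bump the
// sentinel after appending a method would fail here rather than at runtime.
static_assert(kGrpcNumMethods == 6, "every GrpcMethod must be counted");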
@@ -37,6 +37,8 @@ namespace distributed {

 constexpr char kRequestSend[] = "RequestSend";
 constexpr char kRequestGet[] = "RequestGet";
+constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable";
+constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier";
 constexpr char kRequestPrefetch[] = "RequestPrefetch";
 constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
 constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
...
@@ -45,6 +45,11 @@ class RPCClient {
                                    const std::string& var_name,
                                    int64_t time_out = FLAGS_rpc_deadline) = 0;

+  virtual VarHandlePtr AsyncGetMonomerVariable(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncPrefetchVar(
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& in_var_name,
@@ -57,6 +62,10 @@ class RPCClient {
   virtual VarHandlePtr AsyncSendFetchBarrier(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;

+  virtual VarHandlePtr AsyncGetMonomerBarrier(
+      const std::string& ep, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncCheckpointNotify(
       const std::string& ep, const std::string& dir,
       int64_t time_out = FLAGS_rpc_deadline) = 0;
@@ -87,8 +96,9 @@ class RPCClient {
     }
   }

- protected:
   virtual void InitImpl() {}

+ protected:
   // each trainer has exactly one trainer id; it should be static
   static int trainer_id_;
...
@@ -132,6 +132,96 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
       lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
 }

+void RPCServer::RegisterVar(const std::string& var_name,
+                            const std::string& rpc_name,
+                            framework::Scope* scope,
+                            platform::DeviceContext* dev_ctx) {
+  MonomerHandle h;
+  h.var_name_ = var_name;
+  h.rpc_name_ = rpc_name;
+  h.scope_ = scope;
+  h.dev_ctx_ = dev_ctx;
+
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (var_map_.find(var_name) != var_map_.end()) {
+      PADDLE_ENFORCE(false, "%s already in var_map", var_name);
+    }
+    var_map_[var_name] = h;
+  }
+
+  rpc_cond_.notify_all();
+  VLOG(4) << "RegisterVar context:" << h.String();
+}
+
+void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
+  int b = 0;
+  MonomerHandle h;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    b = ++var_map_[var_name].barrier_;
+    h = var_map_[var_name];
+  }
+
+  if (b >= client_num_) {
+    barrier_cond_.notify_all();
+  }
+
+  VLOG(4) << "IncreaseVarBarrier context:" << h.String();
+}
+
+void RPCServer::WaitVarBarrier(const std::string& var_name) {
+  VLOG(4) << "WaitVarBarrier var_name:" << var_name;
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  barrier_cond_.wait(lock, [&]() {
+    return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) ||
+            exit_flag_.load());
+  });
+
+  VLOG(4) << "WaitVarBarrier context: " << var_map_[var_name].String();
+}
+
+void RPCServer::SetVarCond(const std::string& var_name) {
+  VLOG(4) << "SetVarCond var_name:" << var_name;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (var_map_.find(var_name) != var_map_.end()) {
+      rpc_cond_.notify_all();
+    }
+  }
+}
+
+void RPCServer::WaitVarCond(const std::string& var_name) {
+  VLOG(4) << "WaitVarCond var_name:" << var_name;
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  rpc_cond_.wait(lock, [=] {
+    return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
+  });
+
+  VLOG(4) << "WaitVarCond var_name:" << var_name << " end";
+}
+
+MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
+  MonomerHandle h;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    h = var_map_[var_name];
+  }
+  return h;
+}
+
+void RPCServer::ClearRegisteredVars() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  var_map_.clear();
+}
+
+void RPCServer::ClearVar(const std::string& var_name) {
+  std::unique_lock<std::mutex> lock(mutex_);
+  var_map_.erase(var_name);
+}
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
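Taken together, the new RPCServer entry points form a register/wait/clear lifecycle. A server-side sketch of that sequence, based on the calls above and on collective_server_test.cc further up; the variable name is illustrative:

#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"

void ServeOneRound(paddle::operators::distributed::RPCServer* rpc_server,
                   paddle::framework::Scope* scope,
                   paddle::platform::DeviceContext* dev_ctx) {
  namespace dist = paddle::operators::distributed;
  // 1. Publish the variable; WaitVarCond in the request handlers unblocks
  //    once the name appears in var_map_.
  rpc_server->RegisterVar("var1", dist::kRequestGetMonomerVariable, scope,
                          dev_ctx);
  // 2. Each client sends a GetMonomerBarrier after reading, which lands in
  //    IncreaseVarBarrier; block here until all client_num_ have done so.
  rpc_server->WaitVarBarrier("var1");
  // 3. Drop the bookkeeping so the name can be registered again next round.
  rpc_server->ClearRegisteredVars();
}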
@@ -21,12 +21,30 @@
 #include <utility>
 #include <vector>

+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/platform/device_context.h"

 namespace paddle {
 namespace operators {
 namespace distributed {

+struct MonomerHandle {
+  std::string var_name_;
+  std::string rpc_name_;
+  framework::Scope* scope_{nullptr};
+  platform::DeviceContext* dev_ctx_{nullptr};
+  int64_t barrier_{0};
+
+  std::string String() {
+    std::stringstream ss;
+    ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_
+       << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_
+       << ", barrier_:" << barrier_;
+    return ss.str();
+  }
+};
+
 class RPCServer {
  public:
   explicit RPCServer(const std::string& address, int client_num)
@@ -67,6 +85,16 @@ class RPCServer {
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);

+  void RegisterVar(const std::string& var_name, const std::string& rpc_name,
+                   framework::Scope* scope, platform::DeviceContext* dev_ctx);
+  void IncreaseVarBarrier(const std::string& var_name);
+  void WaitVarBarrier(const std::string& var_name);
+  void SetVarCond(const std::string& var_name);
+  void WaitVarCond(const std::string& var_name);
+  void ClearRegisteredVars();
+  void ClearVar(const std::string& var_name);
+  MonomerHandle GetMonomer(const std::string& var_name);
+
   void Complete();

   void ResetBarrierCounter();
@@ -95,6 +123,9 @@ class RPCServer {
   std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
   std::unordered_map<std::string, int> rpc_thread_num_;
   friend class RequestHandler;
+
+  // TODO(gongwb): use more cond to notify or wait;
+  std::unordered_map<std::string, MonomerHandle> var_map_;
 };

 };  // namespace distributed
...
@@ -28,6 +28,9 @@ service SendRecvService {
   rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}

   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
+
+  rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
+  rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }

 // VariableMessage is serialized paddle variable message.
...
@@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
+    auto model_from_memory = Attr<bool>("model_from_memory");
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin),
-                   "Cannot open file %s for load_combine op", filename);
     auto out_var_names = Outputs("Out");
     PADDLE_ENFORCE_GT(
         static_cast<int>(out_var_names.size()), 0,
         "The number of output variables should be greater than 0.");
+    if (!model_from_memory) {
+      std::ifstream fin(filename);
+      PADDLE_ENFORCE(static_cast<bool>(fin),
+                     "Cannot open file %s for load_combine op", filename);
+      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
+    } else {
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream fin(filename);
+      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
+    }
+  }
+
+  void LoadParamsFromBuffer(
+      const framework::Scope &scope, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
@@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase {
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();

       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
-                     filename);
+      PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");

       // Get data from fin to tensor
-      DeserializeFromStream(fin, tensor, dev_ctx);
+      DeserializeFromStream(*buffer, tensor, dev_ctx);

       auto in_dtype = framework::ToDataType(tensor->type());
       auto out_dtype =
@@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                          "LoDTensors will be loaded from \"file_path\".")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddAttr<bool>("model_from_memory",
+                  "(boolean, default false)"
+                  "If true, file_path is in memory, and LoDTensors will be "
+                  "loaded directly from memory")
+        .SetDefault(false);
     AddComment(R"DOC(
 LoadCombine Operator.

-LoadCombine operator loads LoDTensor variables from a file. The file should
-contain one or more LoDTensors serialized using the SaveCombine operator. The
+LoadCombine operator loads LoDTensor variables from a file, which could be
+loaded in memory already. The file should contain one or more LoDTensors
+serialized using the SaveCombine operator. The
 LoadCombine operator applies a deserialization strategy to appropriately load
 the LodTensors, and this strategy complements the serialization strategy used
 in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
...
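The refactoring above works because std::ifstream and std::stringstream share the std::istream interface, so one LoadParamsFromBuffer covers both the file and the in-memory branch. A standalone illustration of that design choice — the names here are ours, not from the operator:

#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>

// One deserialization routine for both sources, as in LoadParamsFromBuffer.
static std::string ReadAll(std::istream* buffer) {
  return std::string(std::istreambuf_iterator<char>(*buffer),
                     std::istreambuf_iterator<char>());
}

int main() {
  std::stringstream in_memory("serialized-params");  // model_from_memory=true
  std::cout << ReadAll(&in_memory) << "\n";

  std::ifstream from_file("params.bin");             // model_from_memory=false
  if (from_file) std::cout << ReadAll(&from_file) << "\n";
  return 0;
}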
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once

+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
...
@@ -72,10 +72,11 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
       auto rows_idx = outs_rows_idx[i];
       outs[i]->set_height(height_sections[i]);
+      auto dims = x->GetCompleteDims();
+      dims[0] = rows_idx.size();
+      outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
+      outs[i]->mutable_rows()->clear();
       if (rows_idx.size() > 0) {
-        auto dims = x->GetCompleteDims();
-        dims[0] = rows_idx.size();
-        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
         for (auto idx : rows_idx) {
           outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
         }
@@ -98,6 +99,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
         }
       }
+      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
+                        "rows should have the same size as tensor dim 0");
     }
   }
 };
...
...@@ -120,15 +120,24 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -120,15 +120,24 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
} }
void* allocate(size_t num_bytes) const override { void* allocate(size_t num_bytes) const override {
if (UNLIKELY(num_bytes == 0)) {
return nullptr;
}
auto buf = paddle::memory::Alloc(place_, num_bytes, auto buf = paddle::memory::Alloc(place_, num_bytes,
memory::Allocator::kScratchpad); memory::Allocator::kScratchpad);
void* retv = buf->ptr(); void* retv = buf->ptr();
allocations_[buf->ptr()] = std::move(buf); {
std::lock_guard<std::mutex> lock(mtx_);
allocations_.emplace(retv, std::move(buf));
}
return retv; return retv;
} }
void deallocate(void* buffer) const override { void deallocate(void* buffer) const override {
allocations_.erase(allocations_.find(buffer)); if (LIKELY(buffer)) {
std::lock_guard<std::mutex> lock(mtx_);
allocations_.erase(buffer);
}
} }
void* scratchpad() const override { void* scratchpad() const override {
...@@ -155,6 +164,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -155,6 +164,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
const cudaDeviceProp* device_prop_; // not owned; const cudaDeviceProp* device_prop_; // not owned;
mutable void* scratch_; mutable void* scratch_;
mutable unsigned int* semaphore_; mutable unsigned int* semaphore_;
mutable std::mutex mtx_; // to protect allocations_
mutable std::unordered_map<void*, memory::AllocationPtr> allocations_; mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
}; };
@@ -210,6 +220,40 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
                           << (cudnn_dso_ver % 100) / 10 << ".";
+  {
+    // Check CUDA/CUDNN version compatibility
+    auto local_cuda_version = runtime_version_ / 100;
+    auto compile_cuda_version = CUDA_VERSION / 100;
+    if (local_cuda_version < compile_cuda_version) {
+      LOG_FIRST_N(WARNING, 1)
+          << "WARNING: device: " << place_.device
+          << ". The installed Paddle is compiled with CUDA "
+          << compile_cuda_version / 10 << "." << compile_cuda_version % 10
+          << ", but CUDA runtime version in your machine is "
+          << local_cuda_version / 10 << "." << local_cuda_version % 10
+          << ", which may cause serious incompatibility issues. "
+          << "Please recompile or reinstall Paddle with compatible CUDA "
+             "version.";
+    }
+    if (dynload::HasCUDNN()) {
+      auto local_cudnn_version = cudnn_dso_ver / 100;
+      auto compile_cudnn_version = CUDNN_VERSION / 100;
+      if (local_cudnn_version < compile_cudnn_version) {
+        LOG_FIRST_N(WARNING, 1)
+            << "WARNING: device: " << place_.device
+            << ". The installed Paddle is compiled with CUDNN "
+            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
+            << ", but CUDNN version in your machine is "
+            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
+            << ", which may cause serious incompatibility issues. "
+            << "Please recompile or reinstall Paddle with compatible CUDNN "
+               "version.";
+      }
+    }
+  }
   callback_manager_.reset(new StreamCallbackManager(stream_));
 }
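The integer arithmetic in the new check is compact; a worked example helps (assuming the usual encodings: `cudaRuntimeGetVersion()` reports 9.2 as 9020, `cudnnGetVersion()` reports 7.3.1 as 7301):

```python
# CUDA: 1000*major + 10*minor, so dividing by 100 keeps roughly the major
# version only (9020 // 100 == 90, same as 9.0's 9000 // 100).
local_cuda = 9020 // 100
print("%d.%d" % (local_cuda // 10, local_cuda % 10))    # -> 9.0

# cuDNN: 1000*major + 100*minor + patch, so dividing by 100 keeps
# major.minor intact (7301 // 100 == 73).
local_cudnn = 7301 // 100
print("%d.%d" % (local_cudnn // 10, local_cudnn % 10))  # -> 7.3
```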
......
@@ -113,6 +113,18 @@ inline mkldnn::memory::format MKLDNNFormatForSize(
     return mkldnn::memory::format::x;
   } else if (dims_size == 2) {
     return mkldnn::memory::format::nc;
+  } else if (dims_size == 3) {
+    if (data_format == mkldnn::memory::format::nchw) {
+      return mkldnn::memory::format::ncw;
+    } else if (data_format == mkldnn::memory::format::nhwc) {
+      return mkldnn::memory::format::nwc;
+    }
+  } else if (dims_size == 5) {
+    if (data_format == mkldnn::memory::format::nchw) {
+      return mkldnn::memory::format::ncdhw;
+    } else if (data_format == mkldnn::memory::format::nhwc) {
+      return mkldnn::memory::format::ndhwc;
+    }
   }
   return data_format;
 }
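The new branches only widen the existing dims-to-format dispatch to 3-D and 5-D tensors. A hypothetical Python transcription of the full mapping, for reference:

```python
# Hypothetical transcription of MKLDNNFormatForSize's dispatch above.
def mkldnn_format_for_size(dims_size, data_format):
    if dims_size == 1:
        return 'x'
    if dims_size == 2:
        return 'nc'
    if dims_size == 3:   # new: 1-D spatial data, drop the H axis
        return {'nchw': 'ncw', 'nhwc': 'nwc'}.get(data_format, data_format)
    if dims_size == 5:   # new: 3-D spatial data, add the D axis
        return {'nchw': 'ncdhw', 'nhwc': 'ndhwc'}.get(data_format, data_format)
    return data_format
```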
......
-set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler)
+set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer)
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/tracer.h"
namespace paddle {
namespace pybind {
// Bind Methods
void BindTracer(pybind11::module *m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) {
new (&self) imperative::Tracer(root_block);
})
.def("trace", &imperative::Tracer::Trace)
.def("get_scope", &imperative::Tracer::GetScope,
pybind11::return_value_policy::reference);
}
} // namespace pybind
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace paddle {
namespace pybind {
class PyLayer : public imperative::Layer {
public:
using imperative::Layer::Layer; // Inherit constructors
std::vector<imperative::VarBase> Forward(
const std::vector<imperative::VarBase>& inputs) override {
PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
inputs); // NOLINT
}
void Backward() override {
PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT
}
};
class PyOpBase : public imperative::OpBase {
public:
using imperative::OpBase::OpBase; // Inherit constructors
};
class PyVarBase : public imperative::VarBase {
public:
using imperative::VarBase::VarBase; // Inherit constructors
};
void BindTracer(pybind11::module* m);
} // namespace pybind
} // namespace paddle
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -45,6 +46,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/async_executor_py.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
+#include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
@@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) {
   BindException(&m);

+  py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
+      .def(py::init<>())
+      .def("_run_backward",
+           [](imperative::VarBase &self, framework::Scope *scope) {
+             self.RunBackward(scope);
+           })
+      .def("_grad", &imperative::VarBase::Grad)
+      .def_property(
+          "desc",
+          [](const imperative::VarBase &self) { return self.var_desc_; },
+          [](imperative::VarBase &self, framework::VarDesc *var_desc) {
+            self.var_desc_ = var_desc;
+          },
+          py::return_value_policy::reference);
+
+  py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
+      .def(py::init<>())
+      .def_property(
+          "desc", [](const imperative::OpBase &self) { return self.op_desc_; },
+          [](imperative::OpBase &self, framework::OpDesc *op_desc) {
+            if (op_desc) {
+              self.op_desc_ = op_desc;
+            }
+          },
+          py::return_value_policy::reference);
+
+  py::class_<imperative::Layer, PyLayer /* <--- trampoline*/> layer(m, "Layer");
+  layer.def(py::init<>())
+      .def("forward",
+           [](imperative::Layer &self,
+              const std::vector<imperative::VarBase> &inputs) {
+             return self.Forward(inputs);
+           })
+      .def("backward", &imperative::Layer::Backward);
+
+  BindTracer(&m);
+
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
@@ -298,6 +336,8 @@ PYBIND11_MODULE(core, m) {
       .def("get_tensor",
           [](SelectedRows &self) { return self.mutable_value(); },
           py::return_value_policy::reference)
+      .def("numel",
+           [](SelectedRows &self) -> int64_t { return self.value().numel(); })
       .def("set_height", &SelectedRows::set_height)
       .def("height", &SelectedRows::height)
       .def("set_rows",
@@ -601,6 +641,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
+  m.def("get_variable_tensor", framework::GetVariableTensor);

   m.def("_is_program_version_supported", IsProgramVersionSupported);
@@ -886,6 +927,18 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self, int num_trainers) {
             self.num_trainers_ = num_trainers;
           })
+      .def_property(
+          "trainers_endpoints",
+          [](const BuildStrategy &self) { return self.trainers_endpoints_; },
+          [](BuildStrategy &self,
+             const std::vector<std::string> &trainers_endpoints) {
+            self.trainers_endpoints_ = trainers_endpoints;
+          })
+      .def_property("trainer_id",
+                    [](const BuildStrategy &self) { return self.trainer_id_; },
+                    [](BuildStrategy &self, int trainer_id) {
+                      self.trainer_id_ = trainer_id;
+                    })
       .def_property(
           "fuse_elewise_add_act_ops",
           [](const BuildStrategy &self) {
......
@@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray(
     paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (int i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
......
@@ -32,11 +32,28 @@ the image layout as follows.
 from __future__ import print_function

+import six
 import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
+# FIXME(minqiyang): this is an ugly fix for the numpy bug reported here
+# https://github.com/numpy/numpy/issues/12497
+if six.PY3:
+    import subprocess
+    import sys
+    import_cv2_proc = subprocess.Popen(
+        [sys.executable, "-c", "import cv2"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE)
+    out, err = import_cv2_proc.communicate()
+    retcode = import_cv2_proc.poll()
+    if retcode != 0:
+        cv2 = None
+    else:
+        import cv2
+else:
+    try:
+        import cv2
+    except ImportError:
+        cv2 = None
 import os
 import tarfile
 import six.moves.cPickle as pickle
......
@@ -34,6 +34,7 @@ from . import io
 from . import evaluator
 from . import initializer
 from . import layers
+from . import imperative
 from . import contrib
 from . import nets
 from . import optimizer
@@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \
     'initializer',
     'layers',
     'contrib',
+    'imperative',
     'transpiler',
     'nets',
     'optimizer',
......
@@ -258,10 +258,13 @@ class DataFeeder(object):
         multiple mini-batches. Each mini-batch will be fed on each device.

     Args:
-        reader(fun): the input data.
-        multi_devices(bool): the number of places. Default None.
-        num_places(int): the number of places. Default None.
-        drop_last(bool): the number of places. Default None.
+        reader(function): the reader is the function which can generate data.
+        multi_devices(bool): whether to use multiple devices or not.
+        num_places(int): if multi_devices is True, you can specify the number
+            of GPUs to use; if num_places is None, all the GPUs of the
+            current machine will be used. Default None.
+        drop_last(bool): whether to drop the last batch if the
+            size of the last batch is less than batch_size. Default True.

     Returns:
         dict: the result of conversion.
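A minimal usage sketch that matches the corrected argument descriptions (the feed variables `x` and `y` are assumptions added for illustration):

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
feeder = fluid.DataFeeder(feed_list=[x, y], place=fluid.CPUPlace())

# Each reader item must line up with feed_list: 13 floats, then 1 float.
mini_batch = [([0.1] * 13, [1.0]), ([0.2] * 13, [0.0])]
feed_dict = feeder.feed(mini_batch)  # dict: variable name -> LoDTensor
```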
......
@@ -18,6 +18,7 @@ import collections
 import contextlib
 import re
 import six
+import sys

 import numpy as np
@@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
 ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()

+_imperative_tracer_ = None
+
+
+def _in_imperative_mode():
+    return _imperative_tracer_ is not None
+
+
+def _imperative_tracer():
+    return _imperative_tracer_
+

 class NameScope(object):
     def __init__(self, name="", parent=None):
@@ -345,6 +356,21 @@ class Variable(object):
         self.op = None
         self.stop_gradient = stop_gradient
         self.is_data = is_data
+        if _in_imperative_mode():
+            self._ivar = core.VarBase()
+            self._ivar.desc = self.desc
+
+    def _numpy(self):
+        scope = _imperative_tracer().get_scope(self.block.desc)
+        tensor = core.get_variable_tensor(scope, self.desc.name())
+        return np.array(tensor)
+
+    def _backward(self):
+        scope = _imperative_tracer().get_scope(self.block.desc)
+        self._ivar._run_backward(scope)
+
+    def _gradient(self):
+        return np.array(self._ivar._grad())

     def __str__(self):
         return self.to_string(True)
@@ -655,6 +681,23 @@ class Operator(object):
         if self._has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
+        if _in_imperative_mode():
+            self.iop = core.OpBase()
+            self.iop.desc = self.desc
+            self.inputs = []
+            if inputs is not None:
+                for inp in inputs.values():
+                    if isinstance(inp, Variable):
+                        self.inputs.append(inp)
+                    elif isinstance(inp, list) or isinstance(inp, tuple):
+                        self.inputs.extend(inp[:])
+            self.outputs = []
+            if outputs is not None:
+                for out in outputs.values():
+                    if isinstance(out, Variable):
+                        self.outputs.append(out)
+                    elif isinstance(out, list) or isinstance(out, tuple):
+                        self.outputs.extend(out[:])

     def _has_kernel(self, op_type):
         return op_type not in self.OP_WITHOUT_KERNEL_SET
@@ -1041,19 +1084,15 @@ class Block(object):
             raise ValueError("var %s not in this block" % name)
         return v

-    def _var_recursive(self, name):
+    def _find_var_recursive(self, name):
         """
         Get a Variable by name from this block recursively.

         Args:
             name(str): the Variable's name.

-        Raises:
-            ValueError: this block and this parent block doesn't
-                have a Variable with the giving name.
-
         Returns:
-            Variable: the Variable with the giving name.
+            Variable: the Variable with the given name, or None if not found.
         """
         frontier = list()
         visited = set()
@@ -1079,8 +1118,27 @@ class Block(object):
                 frontier.append(prog.block(cur.forward_block_idx))

             visited.add(id(cur))

-        raise ValueError("Var {0} is not found recursively".format(name))
+        return None
+
+    def _var_recursive(self, name):
+        """
+        Get a Variable by name from this block recursively.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: neither this block nor its ancestor blocks have
+                a Variable with the given name.
+
+        Returns:
+            Variable: the Variable with the given name.
+        """
+        var = self._find_var_recursive(name)
+        if var:
+            return var
+        else:
+            raise ValueError("Var {0} is not found recursively".format(name))

     def all_parameters(self):
         return list(self.iter_parameters())
@@ -1206,6 +1264,9 @@ class Block(object):
         """
         op_desc = self.desc.append_op()
         op = Operator(block=self, desc=op_desc, *args, **kwargs)
+        if _in_imperative_mode():
+            _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
+                                       [v._ivar for v in op.outputs], self.desc)
         self.ops.append(op)
         return op
@@ -1442,6 +1503,7 @@ class Program(object):
         self._is_chief = False
         self._slice_vars_and_attrs = []
         self._endpoints = []
+        self._trainers_endpoints = []
         self._distributed_lookup_table = None

     @property
@@ -2209,3 +2271,12 @@ def _get_var(name, program=None):
     assert isinstance(program, Program)

     return program.global_block().var(name)
+
+
+@contextlib.contextmanager
+def _imperative_guard(tracer):
+    global _imperative_tracer_
+    tmp_trace = _imperative_tracer_
+    _imperative_tracer_ = tracer
+
+    yield
+
+    _imperative_tracer_ = tmp_trace
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import base
from .base import *
from . import layers
from .layers import *
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import numpy as np
from paddle.fluid import core
from paddle.fluid import framework
__all__ = ['enabled', 'guard', 'to_variable']
def enabled():
return framework._in_imperative_mode()
@contextlib.contextmanager
def guard():
train = framework.Program()
startup = framework.Program()
tracer = core.Tracer(train.current_block().desc)
with framework.program_guard(train, startup):
with framework.unique_name.guard():
with framework._imperative_guard(tracer):
yield
def to_variable(value, block=None):
if isinstance(value, np.ndarray):
if not block:
block = framework.default_main_program().current_block()
py_var = framework.Variable(
block,
type=core.VarDesc.VarType.LOD_TENSOR,
name=None,
shape=value.shape,
dtype=value.dtype)
scope = framework._imperative_tracer().get_scope(block.desc)
var = scope.var(py_var.name)
tensor = var.get_tensor()
tensor.set(value, core.CPUPlace())
return py_var
elif isinstance(value, framework.Variable):
return value
else:
raise ValueError("Unsupported type %s" % type(value))
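Together, `guard()` and `to_variable` form the entry point of the new imperative mode; a minimal usage sketch consistent with the helpers above:

```python
import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    # Inside the guard a Tracer is installed, so ops execute eagerly.
    x = fluid.imperative.to_variable(np.ones([2, 2], dtype=np.float32))
    y = fluid.layers.relu(x)
    print(y._numpy())  # fetches the result from the tracer's scope
```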
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import sys
import numpy as np
from paddle.fluid import core
from paddle.fluid import framework
from paddle.fluid.imperative import base
__all__ = ['PyLayer']
class PyLayer(core.Layer):
def __init__(self):
pass
def __call__(self, inputs):
# TODO(panyx0718): Support declarative mode as well.
assert base.enabled()
if not isinstance(inputs, list) and not isinstance(inputs, tuple):
inputs = [inputs]
var_inputs = []
for x in inputs:
py_var = base.to_variable(x)
var_inputs.append(py_var)
outputs = self.forward(var_inputs)
return outputs
def forward(self, inputs):
return []
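A `PyLayer` subclass then only needs to override `forward`; `__call__` takes care of wrapping raw ndarrays into variables first. A sketch consistent with the class above (the layer body is an illustrative assumption):

```python
import numpy as np
import paddle.fluid as fluid

class MyLayer(fluid.imperative.PyLayer):
    def forward(self, inputs):
        x = fluid.layers.relu(inputs[0])
        return [fluid.layers.reduce_sum(x)]

with fluid.imperative.guard():
    out = MyLayer()(np.array([[1.0, -1.0]], dtype=np.float32))[0]
    print(out._numpy())  # eager forward result
    out._backward()      # populates gradients through the tracer
```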
@@ -145,7 +145,7 @@ def save_vars(executor,
             prog = fluid.default_main_program()
             fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
-                               vars=None)
+                               vars=None, predicate=name_has_fc)
             # All variables in `main_program` whose name includes "fc" will be saved.
             # And variables are going to be saved separately.
@@ -369,7 +369,7 @@ def load_vars(executor,
             prog = fluid.default_main_program()
             fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
-                               vars=None)
+                               vars=None, predicate=name_has_fc)
             # All variables in `main_program` whose name includes "fc" will be loaded.
             # And all the variables are supposed to have been saved in different files.
......
@@ -17,10 +17,13 @@ from __future__ import print_function
 import copy
 import itertools
 import six
+import sys
+
+import numpy as np

 from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
 from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
+from paddle.fluid.imperative import base
 from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
 from six.moves import zip
@@ -46,23 +49,21 @@ class LayerHelper(object):
     def startup_program(self):
         return default_startup_program()

+    def to_variable(self, x):
+        return base.to_variable(x, self.main_program.current_block())
+
     def append_op(self, *args, **kwargs):
         return self.main_program.current_block().append_op(*args, **kwargs)

     def multiple_input(self, input_param_name='input'):
         inputs = self.kwargs.get(input_param_name, [])
-        type_error = TypeError(
-            "Input of {0} layer should be Variable or sequence of Variable".
-            format(self.layer_type))
-        if isinstance(inputs, Variable):
-            inputs = [inputs]
-        elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
-            raise type_error
+        ret = []
+        if isinstance(inputs, list) or isinstance(inputs, tuple):
+            for inp in inputs:
+                ret.append(self.to_variable(inp))
         else:
-            for each in inputs:
-                if not isinstance(each, Variable):
-                    raise type_error
-            return inputs
+            ret.append(self.to_variable(inputs))
+        return ret

     def input(self, input_param_name='input'):
         inputs = self.multiple_input(input_param_name)
......
@@ -717,8 +717,9 @@ class While(object):
         out_vars = []
         for inner_out_name in inner_outputs:
-            if inner_out_name in parent_block.vars:
-                out_vars.append(parent_block.var(inner_out_name))
+            inner_var = parent_block._find_var_recursive(inner_out_name)
+            if inner_var:
+                out_vars.append(inner_var)

         step_scope = parent_block.create_var(
             type=core.VarDesc.VarType.STEP_SCOPES)
@@ -1264,10 +1265,11 @@ class ConditionalBlock(object):
             if each_name not in input_set
         ]

-        out_list = [
-            parent_block.var(var_name) for var_name in parent_block.vars
-            if var_name in intermediate
-        ]
+        out_list = []
+        for inner_out_name in intermediate:
+            inner_var = parent_block._find_var_recursive(inner_out_name)
+            if inner_var:
+                out_list.append(inner_var)

         step_scope = parent_block.create_var(
             type=core.VarDesc.VarType.STEP_SCOPES)
......
@@ -41,6 +41,7 @@ __all__ = [
     'crf_decoding',
     'cos_sim',
     'cross_entropy',
+    'bpr_loss',
     'square_error_cost',
     'chunk_eval',
     'sequence_conv',
@@ -1348,6 +1349,44 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
     return out


def bpr_loss(input, label, name=None):
    """
    Bayesian Personalized Ranking Loss Operator.

    This operator computes a pairwise ranking loss, where `label` is the
    desired item. The loss at a given point in one session is defined as:
    $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j<N_{i},~ j\neq Label[i]}\log(\sigma(X[i, Label[i]]-X[i, j]))$

    For details, see the paper "Session-based Recommendations with Recurrent
    Neural Networks" (https://arxiv.org/abs/1511.06939).

    Args:
        input (Variable|list): a 2-D tensor with shape [N x D], where N is the
            batch size and D is the number of classes.
            This input is not probability but logits.
        label (Variable|list): the ground truth which is a 2-D tensor. `label`
            is a tensor<int64> with shape [N x 1].
        name (str|None): A name for this layer (optional). If set None, the
            layer will be named automatically. Default: None.

    Returns:
        A 2-D tensor with shape [N x 1], the bpr loss.

    Examples:
        .. code-block:: python

          cost = fluid.layers.bpr_loss(input=predict, label=label)
    """
    helper = LayerHelper('bpr_loss', **locals())
    out = helper.create_variable_for_type_inference(dtype=input.dtype)
    helper.append_op(
        type='bpr_loss',
        inputs={'X': [input],
                'Label': [label]},
        outputs={'Y': [out]})
    return out
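The docstring formula can be sanity-checked directly in NumPy (math only, not Paddle API; `bpr_loss_np` is an illustrative helper):

```python
import numpy as np

def bpr_loss_np(X, label):
    # Y[i] = -1/(N-1) * sum_{j != label[i]} log(sigmoid(X[i, label[i]] - X[i, j]))
    N = X.shape[1]
    out = np.empty((X.shape[0], 1))
    for i in range(X.shape[0]):
        pos = X[i, label[i, 0]]
        total = 0.0
        for j in range(N):
            if j != label[i, 0]:
                total += np.log(1.0 / (1.0 + np.exp(-(pos - X[i, j]))))
        out[i, 0] = -total / (N - 1)
    return out

X = np.random.rand(4, 5)
label = np.random.randint(0, 5, (4, 1))
print(bpr_loss_np(X, label))  # shape (4, 1), matching the [N x 1] output
```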
def square_error_cost(input, label):
    """
    **Square error cost layer**
@@ -6623,7 +6662,8 @@ def relu(x, name=None):
     helper = LayerHelper('relu', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
+    helper.append_op(
+        type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
     return out
......
@@ -135,9 +135,17 @@ class ParallelExecutor(object):
             build_strategy = BuildStrategy()

         build_strategy.num_trainers = num_trainers
+        build_strategy.trainer_id = trainer_id

         main = main_program
         main = main if main else framework.default_main_program()
+
+        trainers_endpoints = main._trainers_endpoints
+        if num_trainers > 1 and trainers_endpoints:
+            assert num_trainers == len(
+                trainers_endpoints), "num_trainers == len(end_points)"
+            build_strategy.trainers_endpoints = trainers_endpoints
+
         if scope == None:
             scope = executor.global_scope()
......
@@ -43,14 +43,13 @@ if(APPLE)
         list(REMOVE_ITEM TEST_OPS test_desc_clone)
         list(REMOVE_ITEM TEST_OPS test_program_code)
     endif(NOT WITH_DISTRIBUTE)
-    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
+    message(WARNING "These tests have been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
     # this op is not supported on mac
     list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
     # TODO: add the unit test back when it is fixed
     list(REMOVE_ITEM TEST_OPS test_detection_map_op)
     list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
     list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
-    list(REMOVE_ITEM TEST_OPS test_gradient_clip)
 endif()
 if(NOT WITH_MKLML)
     # this op is not supported on openblas
......
@@ -102,7 +102,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
         if args.mem_opt:
             fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
-        if args.is_dist:
+        if args.update_method == "pserver":
             t = self.get_transpiler(args.trainer_id,
                                     fluid.default_main_program(),
                                     args.endpoints, args.trainers,
@@ -147,7 +147,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
         def get_data():
             origin_batch = next(reader_generator)
-            if args.is_dist and args.use_reader_alloc:
+            if args.update_method == "pserver" and args.use_reader_alloc:
                 new_batch = []
                 for offset, item in enumerate(origin_batch):
                     if offset % 2 == args.trainer_id:
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest, randomize_probability
class TestBprLossOp1(OpTest):
"""Test BprLoss with discrete one-hot labels.
"""
def setUp(self):
self.op_type = "bpr_loss"
batch_size = 40
class_num = 5
X = randomize_probability(batch_size, class_num, dtype='float64')
label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
bpr_loss_result = []
for i in range(batch_size):
sum = 0.0
for j in range(class_num):
if j == label[i][0]:
continue
sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label[i][0]])))
bpr_loss_result.append(-sum / (class_num - 1))
bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64")
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": bpr_loss}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
if __name__ == "__main__":
unittest.main()
@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp):
         self.activation = 'identity'


+class TestIdentityActivationNoResidual(TestConv2dFusionOp):
+    def init_activation(self):
+        self.activation = 'identity'
+        self.add_residual_data = False
+
+
 class TestWithGroup(TestConv2dFusionOp):
     def init_group(self):
         self.groups = 3
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
class TestMKLDNN(TestConv3dOp):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNCase1(TestCase1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNGroup1(TestWithGroup1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNGroup2(TestWithGroup2):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
if __name__ == '__main__':
unittest.main()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.