diff --git a/README.md b/README.md
index 56d6c10c642787836abb55cb2974bda0b8d22da4..c535e9514e1cac9aff51edfcd9bcdc5d34ccd9fd 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 
-### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
+### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.1.0.post87
+pip install paddlepaddle-gpu==1.2.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.1.0.post85
+pip install paddlepaddle-gpu==1.2.0.post85
 
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85
 
 ## Installation
 
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
 
 ## Documentation
 
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
 
    We appreciate your contributions!
 
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 2722ea078ebdf9a88fe2286fb4050fca652ffb7f..fd4cf92d85d5daa891d602d4365122c870920bba 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -66,6 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 6b526f0103ad3c530c06a68757cf89293f4fb84b..595454e90b9cd713fd2baed24538cf5fbc93934a 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
+add_subdirectory(imperative)
 add_subdirectory(operators)
 add_subdirectory(string)
 add_subdirectory(recordio)
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 291d8ffc3c3334c2836e1651a8997984bba084e1..a99cf53b410433c6e4b8a19821779f28c25e678f 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -33,11 +33,7 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
   CheckInit();
   for (size_t i = 0; i < use_slots_.size(); ++i) {
     if (name == use_slots_[i]) {
-      if (use_slots_is_dense_[i]) {
-        feed_vec_[i] = MixTensor(var->GetMutable<Tensor>());
-      } else {
-        feed_vec_[i] = MixTensor(var->GetMutable<LoDTensor>());
-      }
+      feed_vec_[i] = var->GetMutable<LoDTensor>();
     }
   }
 }
@@ -301,6 +297,7 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
           "the data, please check if the data contains unresolvable "
           "characters.\nplease check this error line: %s",
           str);
+
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
         if ((*instance)[idx].GetType()[0] == 'f') {  // float
@@ -337,6 +334,7 @@ void MultiSlotDataFeed::AddInstanceToInsVec(
       (*ins_vec)[i].InitOffset();
     }
   }
+
   for (size_t i = 0; i < instance.size(); ++i) {
     (*ins_vec)[i].AddIns(instance[i]);
   }
@@ -348,36 +346,25 @@ void MultiSlotDataFeed::PutToFeedVec(
     const auto& type = ins_vec[i].GetType();
     const auto& offset = ins_vec[i].GetOffset();
     int total_instance = static_cast<int>(offset.back());
+
     if (type[0] == 'f') {  // float
       const auto& feasign = ins_vec[i].GetFloatData();
-      if (feed_vec_[i].IsDense()) {
-        int size_in_each_batch = total_instance / batch_size_;
-        float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<float>(
-            {batch_size_, size_in_each_batch}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
-      } else {
-        float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data<float>(
-            {total_instance, 1}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
-        LoD data_lod{offset};
-        feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
-      }
+      float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
     } else if (type[0] == 'u') {  // uint64
       // no uint64_t type in paddlepaddle
       const auto& feasign = ins_vec[i].GetUint64Data();
-      if (feed_vec_[i].IsDense()) {
-        int size_in_each_batch = total_instance / batch_size_;
-        int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>(
-            {batch_size_, size_in_each_batch}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
-      } else {
-        int64_t* tensor_ptr =
-            feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>(
-                {total_instance, 1}, platform::CPUPlace());
-        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
-        LoD data_lod{offset};
-        feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
-      }
+      int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
+    }
+
+    LoD data_lod{offset};
+    feed_vec_[i]->set_lod(data_lod);
+    if (use_slots_is_dense_[i]) {
+      int dim = total_instance / batch_size_;
+      feed_vec_[i]->Resize({batch_size_, dim});
     }
   }
 }
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index a7f8d1d31752af200145bc7934e7880910338e9d..7cc6919703680c359b89075777e97676f5253c57 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -30,35 +30,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-// Pack Tensor type and LoDTensor type into MixTensor type, in order
-// to record either Tensor or LoDTensor information at the same time.
-class MixTensor {
- public:
-  MixTensor() {}
-  explicit MixTensor(LoDTensor* lodtensor) {
-    is_dense_ = false;
-    lodtensor_ = lodtensor;
-  }
-  explicit MixTensor(Tensor* tensor) {
-    is_dense_ = true;
-    tensor_ = tensor;
-  }
-  bool IsDense() { return is_dense_; }
-  LoDTensor* GetLoDTensor() {
-    PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr.");
-    return lodtensor_;
-  }
-  Tensor* GetTensor() {
-    PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr.");
-    return tensor_;
-  }
-
- private:
-  bool is_dense_;
-  LoDTensor* lodtensor_;
-  Tensor* tensor_;
-};
-
 // DataFeed is the base virtual class for all ohther DataFeeds.
 // It is used to read files and parse the data for subsequent trainer.
 // Example:
@@ -133,7 +104,7 @@ class DataFeed {
       use_slots_index_;  // -1: not used; >=0: the index of use_slots_
 
   // The data read by DataFeed will be stored here
-  std::vector<MixTensor> feed_vec_;
+  std::vector<LoDTensor*> feed_vec_;
 
   // the batch size defined by user
   int default_batch_size_;
diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc
index 3974f8dbadf332801a822618d77f140db440b29d..b3e969871592394a7ac2fdeab8495677e7bba070 100644
--- a/paddle/fluid/framework/data_feed_test.cc
+++ b/paddle/fluid/framework/data_feed_test.cc
@@ -152,19 +152,13 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
       const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
       std::map<std::string, const paddle::framework::LoDTensor*>
           lodtensor_targets;
-      std::map<std::string, const paddle::framework::Tensor*> tensor_targets;
       for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
         const auto& slot = multi_slot_desc.slots(i);
         if (slot.is_used()) {
           const auto& name = slot.name();
           readers[idx]->AddFeedVar(scope->Var(name), name);
-          if (slot.is_dense()) {
-            tensor_targets[name] =
-                &scope->FindVar(name)->Get<paddle::framework::Tensor>();
-          } else {
-            lodtensor_targets[name] =
-                &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
-          }
+          lodtensor_targets[name] =
+              &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
         }
       }
       readers[idx]->Start();
@@ -175,8 +169,9 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
           if (!slot.is_used()) {
             continue;
           }
+          const paddle::framework::LoDTensor* tens =
+              lodtensor_targets[slot.name()];
           if (slot.is_dense()) {  // dense branch
-            const paddle::framework::Tensor* tens = tensor_targets[slot.name()];
             if (slot.type() == "uint64") {
               const int64_t* data = tens->data<int64_t>();
               int batch_size = tens->dims()[0];
@@ -202,8 +197,6 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
               PADDLE_THROW("Error type in proto file.");
             }
           } else {  // sparse branch
-            const paddle::framework::LoDTensor* tens =
-                lodtensor_targets[slot.name()];
             if (slot.type() == "uint64") {
               const int64_t* data = tens->data<int64_t>();
               for (size_t i = 0; i < tens->NumElements(); ++i) {
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index c9e3a8ac1d1e5228725bff49ecc6d91e640dfe57..5467f6d1b23c0058f06387e3da97c4193dd5ca6c 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
   auto out_format =
       platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
 
-  void* in_data = GetDataFromTensor(in, in_type);
-
   // output tensor has the same dims as input. Reorder don't change dims
   out->Resize(in.dims());
 
-  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
-
-  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
-  auto out_memory =
-      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+  if (in_format != out_format) {
+    void* in_data = GetDataFromTensor(in, in_type);
+    auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
 
-  platform::Reorder(in_memory, out_memory);
+    auto in_memory =
+        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+    auto out_memory =
+        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
 
+    platform::Reorder(in_memory, out_memory);
+  } else {
+    out->ShareDataWith(in);
+  }
   out->set_layout(out_layout);
   // reset format since the out tensor will be feed to non-MKLDNN OPkernel
   out->set_format(memory::format::format_undef);
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 93288936fea1fae897dc26e6d8850da612960333..2f76cb714fcf7297c17552bcff965c6039253a2e 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -15,14 +15,26 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
 if(WITH_GPU)
     nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             dynload_cuda variable_visitor)
-    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
+    if(WITH_DISTRIBUTE)
+        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor sendrecvop_grpc)
+    else()
+        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+    endif()
     nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
     nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 
 else()
     cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
              variable_visitor)
-    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
+    if(WITH_DISTRIBUTE)
+        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim selected_rows_functor sendrecvop_grpc)
+    else()
+        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim selected_rows_functor)
+    endif()
     cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
     cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 1e1b945f63cf480308c05ffc0f9a3b9f0b0da02b..d8526b3f2492992c5c0f6f5e0a85cffca7398700 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -58,6 +58,17 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       }
     }
 
+    CollectiveContext *context = CollectiveContext::GetInstance();
+    context->endpoints_ = strategy_.trainers_endpoints_;
+    context->trainer_id_ = strategy_.trainer_id_;
+    PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0");
+    if (strategy_.trainer_id_ > 0) {
+      PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) <
+                         strategy_.trainers_endpoints_.size(),
+                     "trainer_id_ < endpoints_ size");
+    }
+    VLOG(1) << "CollectiveContext:" << context->String();
+
     // Convert graph to run on multi-devices.
     auto multi_devices_pass = AppendPass("multi_devices_pass");
     multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
@@ -135,16 +146,16 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
     } else if (pass->Type() == "sequential_execution_pass") {
-      VLOG(1) << "set enable_sequential_execution:"
-              << enable_sequential_execution_;
+      LOG(INFO) << "set enable_sequential_execution:"
+                << enable_sequential_execution_;
 
       pass->Erase(kAllOpDescs);
       pass->Set<const std::vector<OpDesc *>>(
           kAllOpDescs,
           new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     } else if (pass->Type() == "all_reduce_deps_pass") {
-      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
-              << ", num_trainers:" << num_trainers_;
+      LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
+                << ", num_trainers:" << num_trainers_;
 
       pass->Erase(kAllOpDescs);
       pass->Set<const std::vector<OpDesc *>>(
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 9f0a25912886cea7a1f287125cfe8612e4b336eb..c97be169575f578dfd18a6290230d1b3f3bd7596 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -74,6 +74,8 @@ struct BuildStrategy {
   bool fuse_broadcast_op_{false};
 
   int num_trainers_{1};
+  int trainer_id_{0};
+  std::vector<std::string> trainers_endpoints_;
   bool remove_unnecessary_lock_{false};
 
   // NOTE:
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index bd6153c0c736f6e32378eebcbf6c4d7e402c9b42..2e5256fbd49a3f8c72840cd55dada4301cb04eb9 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -53,7 +53,7 @@ struct ReduceLoDTensor {
   }
 };
 
-inline void GatherSelectedRows(
+inline void GatherLocalSelectedRows(
     const std::vector<const SelectedRows *> &src_selecte_rows_,
     const std::vector<platform::Place> &in_places,
     const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index c9f1107aeab5a21d46e828308cfcb2dde827cba6..cb864848b938e249ecd9d09e2a02f683959ce413 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,6 +16,12 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/distributed/collective_client.h"
+#include "paddle/fluid/operators/distributed/collective_server.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#endif
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(
@@ -26,6 +32,112 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+std::once_flag CollectiveContext::init_flag_;
+std::unique_ptr<CollectiveContext> CollectiveContext::context_;
+
+static inline std::string GetRemoteVarName(const std::string &var_name,
+                                           int trainer_id) {
+  return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id);
+}
+
+void ReduceOpHandle::Wait(
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes) {
+  // TODO(gongwb): use event wait?
+  for (auto &dev_ctx : dev_ctxes) {
+    dev_ctx.second->Wait();
+  }
+}
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+template <typename DevCtx, typename DataType>
+void ReduceOpHandle::GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selected_rows,
+    const std::vector<platform::Place> &in_places,
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+    VarHandle *out_var_handle, const platform::Place &out_place,
+    SelectedRows *dst_selected_rows) {
+  const CollectiveContext &collective_context =
+      *CollectiveContext::GetInstance();
+
+  // 1. gather local selected rows, merge them
+  std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp";
+  auto scope = local_scopes_.at(out_var_handle->scope_idx_);
+  auto gathered_var_mid = scope->Var(gathered_var_name);
+  auto gathered_select_rows =
+      gathered_var_mid->GetMutable<framework::SelectedRows>();
+  GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place,
+                          gathered_select_rows);
+  // FIXME(gongwb): remove this Wait.
+  Wait(dev_ctxes);
+
+  // merge them
+  auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
+  std::string merged_var_name =
+      GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_);
+  auto merged_select_rows =
+      scope->Var(merged_var_name)->GetMutable<SelectedRows>();
+  operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
+  merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
+
+  // 2. start collective server if it doesn't exist
+  operators::distributed::CollectiveServer *server =
+      operators::distributed::CollectiveServer::GetInstance(
+          collective_context.endpoints_[collective_context.trainer_id_],
+          collective_context.endpoints_.size() - 1);
+
+  auto rpc_server = server->GetRPCServer();
+  rpc_server->RegisterVar(merged_var_name,
+                          operators::distributed::kRequestGetMonomerVariable,
+                          scope, merged_dev_ctx);
+
+  // 3. gather them from all remote nodes.
+  std::vector<const SelectedRows *> remote;
+  operators::distributed::CollectiveClient *client =
+      operators::distributed::CollectiveClient::GetInstance();
+
+  std::vector<operators::distributed::RemoteVar> vars;
+  for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
+    if (i == (unsigned)collective_context.trainer_id_) continue;
+
+    operators::distributed::RemoteVar var;
+    var.trainer_id_ = i;
+    var.var_name_ = GetRemoteVarName(out_var_handle->name_, i);
+    var.ep_ = collective_context.endpoints_[i];
+
+    vars.push_back(var);
+    VLOG(4) << "gather from:" << var.String();
+  }
+
+  // erase gathered vars
+  merged_dev_ctx->Wait();
+  scope->EraseVars(std::vector<std::string>{gathered_var_name});
+
+  PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
+  PADDLE_ENFORCE(remote.size() == vars.size());
+
+  // 4. merged local selected rows.
+  std::vector<const SelectedRows *> all;
+  all.resize(collective_context.endpoints_.size());
+  for (auto v : vars) {
+    all[v.trainer_id_] =
+        scope->FindVar(v.var_name_)->GetMutable<SelectedRows>();
+  }
+  all[collective_context.trainer_id_] = merged_select_rows;
+
+  merge_func(*merged_dev_ctx, all, dst_selected_rows);
+
+  rpc_server->WaitVarBarrier(merged_var_name);
+  rpc_server->ClearVar(merged_var_name);
+
+  // 5. clear mid vars
+  std::vector<std::string> tmp_vars{merged_var_name};
+  for (auto r : vars) {
+    tmp_vars.push_back(r.var_name_);
+  }
+  scope->EraseVars(tmp_vars);
+}
+#endif
+
 void ReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
@@ -90,8 +202,36 @@ void ReduceOpHandle::RunImpl() {
     this->RunAndRecordEvent([&] {
       std::vector<const SelectedRows *> in_selected_rows =
           GetInputValues<SelectedRows>(in_var_handles, var_scopes);
-      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
-                         out_var->GetMutable<framework::SelectedRows>());
+
+      const CollectiveContext &collective_context =
+          *CollectiveContext::GetInstance();
+      VLOG(10) << "GatherSelectedRows CollectiveContext:"
+               << collective_context.String();
+
+      // TODO(gongwb): add cpu support
+      if (collective_context.endpoints_.size() <= 1 ||
+          is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
+        GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
+                                t_out_p,
+                                out_var->GetMutable<framework::SelectedRows>());
+        return;
+      }
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+      if (framework::IsType<const float>(in_selected_rows[0]->value().type())) {
+        GatherSelectedRows<platform::CUDADeviceContext, float>(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+      } else if (framework::IsType<const double>(
+                     in_selected_rows[0]->value().type())) {
+        GatherSelectedRows<platform::CUDADeviceContext, double>(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+      } else {
+        PADDLE_ENFORCE(false,
+                       "only support double or float when gahter SelectedRows");
+      }
+#endif
     });
   } else {
     std::vector<const LoDTensor *> lod_tensors =
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 846839029ca65be1bdeac2f6ea497db07a01b6cf..5491f00f45e9d48c5eb7455396ac51801f2c40ab 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -30,6 +30,32 @@
 namespace paddle {
 namespace framework {
 namespace details {
+struct CollectiveContext {
+  std::vector<std::string> endpoints_;
+  int trainer_id_{0};
+
+  std::string String() const {
+    std::stringstream ss;
+    ss << "endpoints_:";
+    for (auto e : endpoints_) {
+      ss << e << ",";
+    }
+
+    ss << "trainer_id_:" << trainer_id_;
+
+    return ss.str();
+  }
+
+  static CollectiveContext *GetInstance() {
+    std::call_once(init_flag_,
+                   [&]() { context_.reset(new CollectiveContext()); });
+    return context_.get();
+  }
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<CollectiveContext> context_;
+};
 
 struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
@@ -64,6 +90,19 @@ struct ReduceOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+  template <typename DevCtx, typename DataType>
+  void GatherSelectedRows(
+      const std::vector<const SelectedRows *> &src_selecte_rows_,
+      const std::vector<platform::Place> &in_places,
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+      VarHandle *out_var_handle, const platform::Place &out_place,
+      SelectedRows *dst_selecte_rows);
+#endif
+
+  void Wait(
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes);
+
   template <typename T>
   std::vector<const T *> GetInputValues(
       const std::vector<VarHandle *> &in_var_handles,
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
index 4e4001e979fdd0774779fa288402c7847af90637..3d5351161554f46539c54ce3efbfc2d6a36fc419 100644
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() {
   static unsigned concurrency_cap = std::thread::hardware_concurrency();
   int thread_id = this->thread_id_;
 
-  if (thread_id < concurrency_cap) {
+  if (static_cast<unsigned>(thread_id) < concurrency_cap) {
     unsigned proc = thread_id;
 
     cpu_set_t mask;
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 3e9353f5cf67d8de62c5551f12ea786e49190549..6338be75a4b1d3c4caf7a6f7add4d05fec690340 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "glog/logging.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
@@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
   return tensor;
 }
 
+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
+  Variable* var = scope.FindVar(var_name);
+  PADDLE_ENFORCE(var, "%s no in scope", var_name);
+  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  return *var->GetMutable<LoDTensor>();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index 7f504bfd232862c014cb59b6e8301eec74e0351f..031f8e01aa6128b803dcbfb990778e87d4fafc13 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
 LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index);
 
+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
index 449cc78be15bcd2575ce2e6846b41e475f8921f6..d4a701e0b173a96d8605dff308fee7007a0ecc0c 100644
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
@@ -46,14 +46,16 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
   auto* scope = param_scope();
   PADDLE_ENFORCE(scope);
 
+  std::string type = is_conv3d() ? "conv3d" : "conv2d";
+
   GraphPatternDetector gpd;
   auto* conv_input =
       gpd.mutable_pattern()
           ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
           ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
+          ->assert_is_op_input(type, "Input");
   patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
-  conv_bias_pattern(conv_input);
+  conv_bias_pattern(conv_input, is_conv3d());
   int found_conv_bias_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
@@ -109,7 +111,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
       desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
       desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
       desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
-      desc.SetType("conv2d");
+      desc.SetType(type);
 
       for (auto& attr : conv->Op()->GetAttrMap()) {
         desc.SetAttr(attr.first, attr.second);
@@ -135,3 +137,5 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
 }  // namespace paddle
 REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
               paddle::framework::ir::ConvBiasFusePass);
+REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
+              paddle::framework::ir::Conv3DBiasFusePass);
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
index 5775b83b88730ec298c421a15f5c0b83c27b0750..f3ad9f1c2bf14db418629e0c607e2510f01908b8 100644
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
@@ -26,11 +26,19 @@ namespace ir {
 class ConvBiasFusePass : public FusePassBase {
  public:
   virtual ~ConvBiasFusePass() {}
+  virtual bool is_conv3d() const { return false; }
 
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
   const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
+/*
+* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
+*/
+class Conv3DBiasFusePass : public ConvBiasFusePass {
+ public:
+  bool is_conv3d() const override { return true; }
+};
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index fc91564bbaecf7b1725908fc1eb8b1e4d2e20d32..8679118fe28b1c68aea30caf711441823b5255c0 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) {
     switch (role_id) {
       case _INT(OpRole::kForward):
         if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-          LOG(ERROR)
-              << "Cannot add backward operator before forward operator %s."
-              << op->Type();
+          LOG(ERROR) << "Cannot add backward operator before forward operator "
+                     << op->Type();
         }
         break;
       case _INT(OpRole::kBackward):
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 258182b25a16d9135f55cfc300e2602d14f26d73..0118019df2f779a6409365555b530ae3b6d3971f 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1030,10 +1030,11 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
 }
 
 PDNode *patterns::ConvBias::operator()(
-    paddle::framework::ir::PDNode *conv_input) {
+    paddle::framework::ir::PDNode *conv_input, bool is_conv3d) {
+  std::string type = is_conv3d ? "conv3d" : "conv2d";
   // Create Operators
-  conv_input->assert_is_op_input("conv2d", "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  conv_input->assert_is_op_input(type, "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(type);
   auto *eltiwse_op =
       pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
   // Create variables
@@ -1041,11 +1042,11 @@ PDNode *patterns::ConvBias::operator()(
   auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
                               ->AsInput()
                               ->assert_is_persistable_var()
-                              ->assert_is_op_input("conv2d", "Filter");
+                              ->assert_is_op_input(type, "Filter");
   // intermediate variable, will be removed in the IR after fuse.
   auto *conv_out_var = pattern->NewNode(conv_out_repr())
                            ->AsIntermediate()
-                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_only_output_of_op(type)
                            ->assert_is_op_input("elementwise_add");
   // Bias stored in elementwise_add
   auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index c12b9503fd817757ec8d1e988be3e449fc63c6ff..d044802f22d02372e0ddb72c6fd702aebf2f82c3 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -623,7 +623,7 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
 struct ConvBias : public PatternBase {
   ConvBias(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "conv_bias") {}
-  PDNode* operator()(PDNode* conv_input);
+  PDNode* operator()(PDNode* conv_input, bool is_conv3d = false);
   // declare operator node's name
   PATTERN_DECL_NODE(conv);
   PATTERN_DECL_NODE(eltwise);
diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc
index 6d8f020918d4e56fa7f125a659f7f8511ca067ca..57cc98e2ca0175848aa62c62c8ad3b20594b3bde 100644
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp()) {
       auto* op = n->Op();
-      if (n->RuntimeHasAttr("is_test")) {
+      if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) {
         op->SetAttr("is_test", true);
       } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                  end(op_list)) {
diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc
index d9a68c7f1dd2a0dca5204719c4ce6cd9d68292a2..9696441a21661db89146c448742a992d1f7df022 100644
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
       auto* op = node->Op();
       auto op_name = boost::get<std::string>(op->GetAttr("name"));
       if (op_name == "conv3") {
-        ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
+        ASSERT_FALSE(op->HasAttr("is_test"));
       } else {
-        ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
+        ASSERT_TRUE(op->HasAttr("is_test"));
         EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
       }
     }
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
index 1cf1315d3d3059261d84d0e8795a75df4deae005..951fcb066ce759ebfec0182e1e9dca887e343170 100644
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+#include <string>
 
 namespace paddle {
 namespace framework {
@@ -21,9 +22,19 @@ namespace ir {
 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
   for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
-      n->Op()->SetAttr("use_mkldnn", true);
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) {
+        if (op_types_list.empty()) {
+          op->SetAttr("use_mkldnn", true);
+        } else if (std::find(op_types_list.begin(), op_types_list.end(),
+                             n->Name()) != op_types_list.end()) {
+          op->SetAttr("use_mkldnn", true);
+        }
+      }
     }
   }
   return graph;
@@ -33,5 +44,5 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle
 
-REGISTER_PASS(mkldnn_placement_pass,
-              paddle::framework::ir::MKLDNNPlacementPass);
+REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass)
+    .RequirePassAttr("mkldnn_enabled_op_types");
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 7a88cb2b681c1aa5e1b75481b1aba66a125a1d9c..eac67108e2106e986cbe1255a64c956153bc5560 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -30,28 +30,6 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
   return std::unique_ptr<Node>(new Node(name, type));
 }
 
-bool Node::RuntimeHasAttr(const std::string &name) const {
-  if (Op()->HasAttr(name)) {
-    return true;
-  } else {
-    auto &op_info = OpInfoMap::Instance();
-    auto op_type = Op()->Type();
-    if (op_info.Has(op_type)) {
-      auto op_info_ptr = op_info.Get(op_type);
-      if (op_info_ptr.HasOpProtoAndChecker()) {
-        const proto::OpProto &proto = op_info_ptr.Proto();
-        for (int i = 0; i != proto.attrs_size(); ++i) {
-          const proto::OpProto::Attr &attr = proto.attrs(i);
-          if (attr.name() == name) {
-            return true;
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
-
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 1044a96430f060b750580ea0b225787ba6ebd2a0..d2a393b3f19e9aab79098757dae663d030b0fa2b 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -108,18 +108,6 @@ class Node {
            Name().find(ir::Node::kControlDepVarName) != std::string::npos;
   }
 
-  // RuntimeHasAttr is different with HasAttr now.
-  // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
-  // thus, if stored program_desc_ are old which don't have an attr, a new
-  // library which adds the attr already will fail on this function.
-  // Details:
-  // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
-  // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
-  // problem.
-  // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
-  // RuntimeHasAttr.
-  bool RuntimeHasAttr(const std::string& name) const;
-
   std::vector<Node*> inputs;
   std::vector<Node*> outputs;
 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index e8ecd90502933a049cc8f886212579fc061d44ff..dde642764fa5dfce11edcef51ad1be11be331fbc 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -237,6 +237,23 @@ void OpDesc::SetOutput(const std::string &param_name,
   this->outputs_[param_name] = args;
 }
 
+bool OpDesc::HasProtoAttr(const std::string &name) const {
+  auto &op_info = OpInfoMap::Instance();
+  if (op_info.Has(desc_.type())) {
+    auto op_info_ptr = op_info.Get(desc_.type());
+    if (op_info_ptr.HasOpProtoAndChecker()) {
+      const proto::OpProto &proto = op_info_ptr.Proto();
+      for (int i = 0; i != proto.attrs_size(); ++i) {
+        const proto::OpProto::Attr &attr = proto.attrs(i);
+        if (attr.name() == name) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 30c8a26c3d2f0068674aa70b4ff875a2f73c1dca..e8debec7f13706b7fc5a4882d237ee2257e53b7e 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -65,6 +65,8 @@ class OpDesc {
     return attrs_.find(name) != attrs_.end();
   }
 
+  bool HasProtoAttr(const std::string &name) const;
+
   proto::AttrType GetAttrType(const std::string &name) const;
 
   std::vector<std::string> AttrNames() const;
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..373d292b443b7651b785a52a6986b0a0be58ad61
--- /dev/null
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -0,0 +1,3 @@
+cc_library(layer SRCS layer.cc DEPS proto_desc operator)
+cc_library(tracer SRCS tracer.cc DEPS proto_desc)
+cc_library(engine SRCS engine.cc)
diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de7ab0e5918281579728ef48d1517be2cd530af7
--- /dev/null
+++ b/paddle/fluid/imperative/engine.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/engine.h"
+
+#include <mutex>  // NOLINT
+#include <vector>
+
+#include "glog/logging.h"
+
+namespace paddle {
+namespace imperative {
+
+static std::once_flag init_engine;
+static Engine* engine;
+
+class DummyEngine : public Engine {
+ public:
+  void Enqueue(Runnable* runnable) override {
+    queued_runnables_.push_back(runnable);
+  }
+
+  size_t Size() const override { return queued_runnables_.size(); }
+
+  void Sync() override {
+    for (Runnable* l : queued_runnables_) {
+      LOG(INFO) << "running " << reinterpret_cast<void*>(l);
+    }
+    queued_runnables_.clear();
+  }
+
+ private:
+  std::vector<Runnable*> queued_runnables_;
+};
+
+Engine* GetEngine() {
+  std::call_once(init_engine, []() { engine = new DummyEngine(); });
+  return engine;
+}
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1dfa5bda38d0c419aa4ccbea77b32eb7e0d5b23
--- /dev/null
+++ b/paddle/fluid/imperative/engine.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace paddle {
+namespace imperative {
+
+struct Runnable {};
+
+class Engine {
+ public:
+  virtual ~Engine() {}
+
+  virtual void Enqueue(Runnable* runnable) = 0;
+
+  virtual size_t Size() const = 0;
+
+  virtual void Sync() = 0;
+};
+
+Engine* GetEngine();
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..612503768079472ba233ee3fcd43a47fdba9a0cc
--- /dev/null
+++ b/paddle/fluid/imperative/layer.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/layer.h"
+#include <deque>
+#include <limits>
+#include <map>
+#include <random>
+#include <utility>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace paddle {
+namespace imperative {
+
+using framework::Variable;
+
+void AddTo(Variable* src, Variable* dst) {
+  framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
+  framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
+  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld",
+                 dst_tensor->numel(), src_tensor->numel());
+  float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
+  const float* src_data = src_tensor->data<float>();
+  for (size_t i = 0; i < src_tensor->numel(); ++i) {
+    dst_data[i] += src_data[i];
+  }
+}
+
+class Autograd {
+ public:
+  explicit Autograd(framework::Scope* scope) : scope_(scope) {}
+
+  void RunBackward(VarBase* var) {
+    PADDLE_ENFORCE(var->pre_op_->op_desc_);
+    // TODO(panyx0718): Only create for vars that "require_grad"
+    (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
+
+    std::deque<OpBase*> ready;
+    ready.push_back(var->pre_op_);
+
+    std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->pre_op_);
+
+    while (!ready.empty()) {
+      OpBase* ready_op = ready.front();
+      ready.pop_front();
+      std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
+
+      for (size_t i = 0; i < input_grads.size(); ++i) {
+        if (!input_grads[i]) continue;
+        OpBase* pre_op = ready_op->pre_ops_->at(i);
+        if (!pre_op) continue;
+
+        dep_counts[pre_op] -= 1;
+        PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
+        bool pre_op_ready = dep_counts[pre_op] == 0;
+        if (pre_op_ready) {
+          ready.push_back(pre_op);
+        }
+      }
+    }
+  }
+
+ private:
+  std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
+    std::map<OpBase*, int> ret;
+
+    std::deque<OpBase*> queue;
+    queue.push_back(op);
+    std::unordered_set<OpBase*> visited;
+    visited.insert(op);
+    while (!queue.empty()) {
+      OpBase* candidate = queue.front();
+      queue.pop_front();
+      for (OpBase* pre_op : *(candidate->pre_ops_)) {
+        if (!pre_op) continue;
+        if (visited.find(pre_op) == visited.end()) {
+          visited.insert(pre_op);
+          queue.push_back(pre_op);
+        }
+        ret[pre_op] += 1;
+      }
+    }
+
+    return ret;
+  }
+
+  framework::Scope* scope_;
+};
+
+framework::Variable* CreateVariable(const std::string& name,
+                                    const framework::DDim& dim, float val,
+                                    framework::Scope* scope,
+                                    bool random_name = true) {
+  std::string varname = name;
+  if (random_name) {
+    std::mt19937 rng;
+    rng.seed(std::random_device()());
+    std::uniform_int_distribution<std::mt19937::result_type> dist6(
+        1, std::numeric_limits<int>::max());
+    int id = dist6(rng);
+    varname = string::Sprintf("%s@%d", varname, id);
+  }
+
+  VLOG(3) << "creating var " << varname;
+  framework::Variable* var = scope->Var(varname);
+  framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
+
+  float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
+  std::fill(data, data + tensor->numel(), val);
+  return var;
+}
+
+framework::LoDTensor& VarBase::Grad() {
+  VLOG(3) << "get var grad " << var_desc_->Name();
+  return *grads_->GetMutable<framework::LoDTensor>();
+}
+
+void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
+  VLOG(3) << "apply var grad " << var_desc_->Name() << " "
+          << grad->Get<framework::LoDTensor>().data<float>()[0];
+  if (!grads_) {
+    grads_ =
+        CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
+                       var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
+  }
+  AddTo(grad, grads_);
+  VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
+          << grads_->Get<framework::LoDTensor>().data<float>()[0];
+}
+
+std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
+  VLOG(3) << "op grad " << grad_op_desc_->Type();
+
+  for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
+    if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
+      // grad op inputs can be forward inputs, so not in grad_to_var.
+      continue;
+    }
+    VLOG(3) << "op grad in var " << grad_invar;
+    block_->FindRecursiveOrCreateVar(grad_invar);
+    framework::Variable* var = scope->Var(grad_invar);
+    const std::string& invar = grad_to_var_->at(grad_invar);
+    for (VarBase* varbase : *output_vars_) {
+      // Use the accumulated grads_ by sharing the input with grads_.
+      if (varbase->var_desc_->Name() == invar) {
+        var->GetMutable<framework::LoDTensor>()->ShareDataWith(
+            varbase->grads_->Get<framework::LoDTensor>());
+        break;
+      }
+    }
+  }
+
+  for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
+    VLOG(3) << "grad outvar " << outvar;
+    block_->FindRecursiveOrCreateVar(outvar);
+    framework::Variable* var = scope->Var(outvar);
+    if (!var->IsInitialized()) {
+      framework::VarDesc* var_desc = block_->FindVar(outvar);
+      if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+        var->GetMutable<framework::LoDTensor>();
+      } else {
+        LOG(ERROR) << "tracer doesn't support yet";
+      }
+    }
+  }
+  grad_op_desc_->InferShape(*block_);
+  grad_op_desc_->InferVarType(block_);
+  std::unique_ptr<framework::OperatorBase> opbase =
+      framework::OpRegistry::CreateOp(*grad_op_desc_);
+
+  opbase->Run(*scope, platform::CPUPlace());
+
+  // `ret` matches exactly with `input_vars_` of forward op.
+  std::vector<Variable*> ret;
+  for (size_t i = 0; i < input_vars_->size(); ++i) {
+    bool found = false;
+    for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
+      Variable* var = scope->FindVar(outvar);
+      VarBase* origin_var = (*input_vars_)[i];
+      std::string orig_var = grad_to_var_->at(outvar);
+      PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
+      VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
+      origin_var->ApplyGrad(scope, var);
+      found = true;
+      ret.push_back(var);
+      // TODO(panyx0718): There might be another outvar with the same name.
+      // In that case, it doesn't matter the first one or the second one is
+      // used.
+      break;
+    }
+    if (!found) {
+      ret.push_back(nullptr);
+    }
+  }
+  return ret;
+}
+
+void VarBase::RunBackward(framework::Scope* scope) {
+  grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
+                          var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
+                          false);
+  if (!pre_op_) return;
+  Autograd(scope).RunBackward(this);
+}
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
new file mode 100644
index 0000000000000000000000000000000000000000..85a71ca83d21ed2595ddbe684300a46c05fed3af
--- /dev/null
+++ b/paddle/fluid/imperative/layer.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace imperative {
+
+class OpBase;
+
+class VarBase {
+ public:
+  VarBase()
+      : pre_op_(nullptr),
+        pre_op_out_idx_(-1),
+        var_desc_(nullptr),
+        var_(nullptr),
+        grads_(nullptr) {}
+
+  virtual ~VarBase() {}
+
+  void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
+
+  void RunBackward(framework::Scope* scope);
+
+  framework::LoDTensor& Grad();
+
+  OpBase* pre_op_;
+  int pre_op_out_idx_;
+
+  framework::VarDesc* var_desc_;
+  framework::Variable* var_;
+  framework::Variable* grads_;
+};
+
+class OpBase {
+ public:
+  OpBase()
+      : input_vars_(new std::vector<VarBase*>()),
+        output_vars_(new std::vector<VarBase*>()),
+        pre_ops_(new std::vector<OpBase*>()),
+        pre_ops_out_idx_(new std::vector<int>()),
+        op_desc_(nullptr),
+        grad_op_desc_(nullptr) {}
+
+  virtual ~OpBase() {
+    delete input_vars_;
+    delete output_vars_;
+
+    delete pre_ops_;
+    delete pre_ops_out_idx_;
+
+    if (grad_op_desc_) delete grad_op_desc_;
+    if (grad_to_var_) delete grad_to_var_;
+  }
+
+  std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
+
+  std::vector<VarBase*>* input_vars_;
+  std::vector<VarBase*>* output_vars_;
+  std::vector<OpBase*>* pre_ops_;
+  std::vector<int>* pre_ops_out_idx_;
+  framework::OpDesc* op_desc_;
+
+  framework::OpDesc* grad_op_desc_;
+  std::unordered_map<std::string, std::string>* grad_to_var_;
+  framework::BlockDesc* block_;
+};
+
+class Layer {
+ public:
+  virtual ~Layer() {}
+
+  virtual std::vector<VarBase> Forward(const std::vector<VarBase>& inputs) {
+    std::vector<VarBase> vars;
+    return vars;
+  }
+
+  virtual void Backward() { LOG(ERROR) << "To support customize"; }
+};
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f64f9e72c4a23528948183b909d65e90783a4463
--- /dev/null
+++ b/paddle/fluid/imperative/tracer.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/tracer.h"
+
+namespace paddle {
+namespace imperative {}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..433d07c0e5aa0986ab1e9fe349ef865d2851c0c0
--- /dev/null
+++ b/paddle/fluid/imperative/tracer.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/engine.h"
+#include "paddle/fluid/imperative/layer.h"
+
+namespace paddle {
+namespace imperative {
+
+void CreateGradOp(const framework::OpDesc& op_desc,
+                  const std::unordered_set<std::string>& no_grad_set,
+                  const std::vector<framework::BlockDesc*>& grad_sub_block,
+                  framework::OpDesc** grad_op_desc,
+                  std::unordered_map<std::string, std::string>* grad_to_var) {
+  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+      framework::OpInfoMap::Instance()
+          .Get(op_desc.Type())
+          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
+  PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
+  // TODO(panyx0718): Leak?
+  *grad_op_desc = grad_op_descs[0].release();
+}
+
+class Tracer {
+ public:
+  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
+    root_scope_ = new framework::Scope();
+    scopes_[root_block_] = root_scope_;
+  }
+
+  virtual ~Tracer() { delete root_scope_; }
+
+  void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
+             const std::vector<VarBase*>& outputs,
+             framework::BlockDesc* block) {
+    framework::Scope* scope = GetScope(block);
+    framework::OpDesc* op_desc = op->op_desc_;
+    VLOG(3) << "tracer tracing " << op_desc->Type();
+    op_desc->InferShape(*block);
+    op_desc->InferVarType(block);
+    std::unique_ptr<framework::OperatorBase> op_base =
+        framework::OpRegistry::CreateOp(*op_desc);
+
+    *op->input_vars_ = inputs;
+    for (VarBase* input : inputs) {
+      const std::string vname = input->var_desc_->Name();
+      framework::Variable* var = scope->Var(vname);
+      input->var_ = var;
+      if (!var->IsInitialized()) {
+        framework::VarDesc* var_desc = block->FindVar(vname);
+        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+          var->GetMutable<framework::LoDTensor>();
+        } else {
+          LOG(ERROR) << "tracer doesn't support yet";
+        }
+      }
+      if (input->pre_op_) {
+        op->pre_ops_->push_back(input->pre_op_);
+        op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
+      } else {
+        op->pre_ops_->push_back(nullptr);
+      }
+    }
+
+    *op->output_vars_ = outputs;
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      const std::string vname = outputs[i]->var_desc_->Name();
+      framework::Variable* var = scope->Var(vname);
+      if (!var->IsInitialized()) {
+        framework::VarDesc* var_desc = block->FindVar(vname);
+        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+          var->GetMutable<framework::LoDTensor>();
+        } else {
+          LOG(ERROR) << "tracer doesn't support yet";
+        }
+      }
+      outputs[i]->var_ = var;
+      outputs[i]->pre_op_ = op;
+      outputs[i]->pre_op_out_idx_ = i;
+    }
+    op_base->Run(*scope, platform::CPUPlace());
+    framework::OpDesc* grad_op_desc;
+    auto grad_to_var = new std::unordered_map<std::string, std::string>();
+    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
+    op->grad_op_desc_ = grad_op_desc;
+    op->grad_to_var_ = grad_to_var;
+    op->block_ = block;
+  }
+
+  framework::Scope* GetScope(framework::BlockDesc* block) {
+    if (scopes_.find(block) != scopes_.end()) {
+      return scopes_.at(block);
+    }
+    framework::BlockDesc* parent_block = block->ParentBlock();
+    PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end());
+    framework::Scope* scope = &scopes_[parent_block]->NewScope();
+    scopes_[block] = scope;
+    return scope;
+  }
+
+ private:
+  std::map<framework::BlockDesc*, framework::Scope*> scopes_;
+  framework::BlockDesc* root_block_;
+  framework::Scope* root_scope_;
+};
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 21203e2d9f4e4cd22ea49ea7b6808aff07e70eff..83d411eecf6d706615243fd78cb7e4330d904fc1 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -103,6 +103,7 @@ struct Argument {
   // Model specified with program and parameters files.
   DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
   DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
+  DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
 
   // The overall graph to work on.
   DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
@@ -115,6 +116,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
                       std::vector<std::string>);
 
+  // Pass a set of op types to enable its mkldnn kernel
+  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
+                      std::unordered_set<std::string>);
+
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
   DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index fce5e1cac92064a320179243380ea02b2c5d7838..51bca8039d4531536cd7a3c39ef8a27f1a5412a1 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -63,6 +63,11 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
       pass_num++;
     }
+    if (pass_name == "mkldnn_placement_pass") {
+      pass->Set("mkldnn_enabled_op_types",
+                new std::unordered_set<std::string>(
+                    argument->mkldnn_enabled_op_types()));
+    }
 
     if (pass_name == "tensorrt_subgraph_pass") {
       PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
index 740030c3a80e4d7e2ac47998a304be97758b95cb..b8a045c18fab54581b4d2b902be373f55ad09e8a 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
              argument->model_params_path_valid()) {
     auto program =
         LoadModel(argument->model_program_path(), argument->model_params_path(),
-                  argument->scope_ptr(), place);
+                  argument->scope_ptr(), place, argument->model_from_memory());
     argument->SetMainProgram(program.release());
   } else {
     PADDLE_THROW(
@@ -68,9 +68,14 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
 
 std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
     const std::string &program_path, const std::string &params_path,
-    framework::Scope *scope, const platform::Place &place) {
+    framework::Scope *scope, const platform::Place &place,
+    bool model_from_memory) {
   framework::Executor exe(place);
-  return Load(&exe, scope, program_path, params_path);
+  if (!model_from_memory) {
+    return Load(&exe, scope, program_path, params_path);
+  } else {
+    return LoadFromMemory(&exe, scope, program_path, params_path);
+  }
 }
 
 std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; }
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
index 271e64fce579bc9001b1dd632576571cec949752..adbde0433fad28b006b18b47c8fd0a8946d21a98 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
@@ -24,7 +24,7 @@ namespace inference {
 namespace analysis {
 
 /*
- * Load program and parameter to memory from the disk.
+ * Load program and parameter to memory from the disk or directly from memory.
  */
 class IrGraphBuildPass : public AnalysisPass {
  public:
@@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass {
       const platform::Place &place);
   std::unique_ptr<framework::ProgramDesc> LoadModel(
       const std::string &program_path, const std::string &params_path,
-      framework::Scope *scope, const platform::Place &place);
+      framework::Scope *scope, const platform::Place &place,
+      bool model_from_memory);
 
   std::string model_binary_str_;
 };
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index dd75f0d9a65404908667d873786160ddaa73fa57..dcefdd92f5157dce7426f2f3e4a2bc053ce24775 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -49,10 +49,15 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
+  // For mkldnn
+  use_mkldnn_ = other.use_mkldnn_;
+  mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
+
   use_feed_fetch_ops = other.use_feed_fetch_ops;
   use_tensorrt_ = other.use_tensorrt_;
   tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
   tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  model_from_memory_ = other.model_from_memory_;
 
   if (use_gpu) {
     pass_builder_.reset(new GpuPassStrategy(
@@ -76,10 +81,16 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
   cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
+  // For mkldnn
+  use_mkldnn_ = other.use_mkldnn_;
+  mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
+
   use_feed_fetch_ops = other.use_feed_fetch_ops;
   use_tensorrt_ = other.use_tensorrt_;
   tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
   tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  model_from_memory_ = other.model_from_memory_;
+
   pass_builder_ = std::move(other.pass_builder_);
 }
 
@@ -102,4 +113,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
   pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
 }
 
+void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
+                                             size_t prog_buffer_size,
+                                             const char *param_buffer,
+                                             size_t param_buffer_size) {
+  prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size);
+  param_file = std::string(param_buffer, param_buffer + param_buffer_size);
+  model_from_memory_ = true;
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 391330a7c0f2dda731fe8455fdab81b276e3f272..be51e7fc1f01c5fc4a48c7f32db15bb82a5ddc07 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -308,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 
   argument_.SetUseGPU(config_.use_gpu);
   argument_.SetGPUDeviceId(config_.device);
+  argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   if (!config_.model_dir.empty()) {
     argument_.SetModelDir(config_.model_dir);
@@ -326,6 +327,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
   }
 
+  if (config_.use_mkldnn_) {
+    argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
+  }
+
   auto passes = config_.pass_builder()->AllPasses();
   if (!config_.enable_ir_optim) passes.clear();
   argument_.SetIrAnalysisPasses(passes);
@@ -448,20 +453,24 @@ bool AnalysisPredictor::LoadProgramDesc() {
     return false;
   }
 
-  std::string pb_content;
-  // Read binary
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  fin.seekg(0, std::ios::end);
-
-  pb_content.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(pb_content.at(0)), pb_content.size());
-  fin.close();
-
   // Create ProgramDesc
   framework::proto::ProgramDesc proto;
-  proto.ParseFromString(pb_content);
+  if (!config_.model_from_memory()) {
+    std::string pb_content;
+    // Read binary
+    std::ifstream fin(filename, std::ios::in | std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
+                   filename);
+    fin.seekg(0, std::ios::end);
+    pb_content.resize(fin.tellg());
+    fin.seekg(0, std::ios::beg);
+    fin.read(&(pb_content.at(0)), pb_content.size());
+    fin.close();
+
+    proto.ParseFromString(pb_content);
+  } else {
+    proto.ParseFromString(config_.prog_file);
+  }
   inference_program_.reset(new framework::ProgramDesc(proto));
   return true;
 }
@@ -469,6 +478,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
 bool AnalysisPredictor::LoadParameters() {
   PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
                           "The inference program should be loaded first.");
+
   const auto &global_block = inference_program_->MutableBlock(0);
 
   // create a temporary program to load parameters.
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index a09bd1cac2aa31b2ecee9b0f77d2b777104f1161..f05b9832da55f10b34eb2df914e443a478e5a4a4 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -16,6 +16,7 @@
 #include <cassert>
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 // Here we include some header files with relative paths, for that in deploy,
@@ -52,18 +53,26 @@ struct AnalysisConfig : public NativeConfig {
   bool use_tensorrt() const { return use_tensorrt_; }
 
   void EnableMKLDNN();
-  // NOTE this is just for internal development, please not use it.
-  // NOT stable yet.
   bool use_mkldnn() const { return use_mkldnn_; }
+  void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
+    mkldnn_enabled_op_types_ = op_list;
+  }
+
+  // Specify the memory buffer of program and parameter
+  void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
+                      const char* program_buffer, size_t program_buffer_size);
+  bool model_from_memory() const { return model_from_memory_; }
 
   friend class ::paddle::AnalysisPredictor;
 
  protected:
   bool use_tensorrt_{false};
   bool use_mkldnn_{false};
+  std::unordered_set<std::string> mkldnn_enabled_op_types_;
   int tensorrt_workspace_size_;
   int tensorrt_max_batchsize_;
   std::unique_ptr<PassStrategy> pass_builder_;
+  bool model_from_memory_{false};
 };
 
 // Configurations for Anakin engine.
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 825bee833bf918067497f56adebbbcaf55f892a2..bc5139a7e54eaf7133ea96ae3b36915a236a2c5e 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -98,9 +98,10 @@ class CpuPassStrategy : public PassStrategy {
     passes_.insert(passes_.begin(), "mkldnn_placement_pass");
 
     for (auto &pass :
-         std::vector<std::string>({"depthwise_conv_mkldnn_pass",  //
-                                   "conv_bias_mkldnn_fuse_pass",  //
-                                   "conv_relu_mkldnn_fuse_pass",  //
+         std::vector<std::string>({"depthwise_conv_mkldnn_pass",    //
+                                   "conv_bias_mkldnn_fuse_pass",    //
+                                   "conv3d_bias_mkldnn_fuse_pass",  //
+                                   "conv_relu_mkldnn_fuse_pass",    //
                                    "conv_elementwise_add_mkldnn_fuse_pass"})) {
       passes_.push_back(pass);
     }
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 31f43bfdcaafb18c611d86ef26fd9de118562799..24d15f12f9cd4a9280cd316bd727fdbccb831b9b 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) {
 void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
-                      const std::string& param_filename) {
+                      const std::string& param_filename,
+                      bool model_from_memory = false) {
   const framework::BlockDesc& global_block = main_program.Block(0);
 
   framework::ProgramDesc* load_program = new framework::ProgramDesc();
@@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
     op->SetType("load_combine");
     op->SetOutput("Out", paramlist);
     op->SetAttr("file_path", {param_filename});
+    op->SetAttr("model_from_memory", {model_from_memory});
     op->CheckAttrs();
   }
 
@@ -130,16 +132,17 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                  "model version %ld is not supported.",
                  main_program->Version());
 
-  LoadPersistables(executor, scope, *main_program, dirname, "");
+  // model_from_memory is false in seperate parameters.
+  LoadPersistables(executor, scope, *main_program, dirname, "",
+                   false /* model_from_memory */);
   return main_program;
 }
 
 std::unique_ptr<framework::ProgramDesc> Load(
     framework::Executor* executor, framework::Scope* scope,
     const std::string& prog_filename, const std::string& param_filename) {
-  std::string model_filename = prog_filename;
   std::string program_desc_str;
-  ReadBinaryFile(model_filename, &program_desc_str);
+  ReadBinaryFile(prog_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
@@ -147,7 +150,22 @@ std::unique_ptr<framework::ProgramDesc> Load(
                  "model version %ld is not supported.",
                  main_program->Version());
 
-  LoadPersistables(executor, scope, *main_program, "", param_filename);
+  LoadPersistables(executor, scope, *main_program, "", param_filename,
+                   false /* model_from_memory */);
+  return main_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
+    framework::Executor* executor, framework::Scope* scope,
+    const std::string& prog_buffer, const std::string& param_buffer) {
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(prog_buffer));
+  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
+                 "model version %ld is not supported.",
+                 main_program->Version());
+
+  LoadPersistables(executor, scope, *main_program, "", param_buffer,
+                   true /* model_filename */);
   return main_program;
 }
 
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index ab492577c1476abee30d6dd1c740394391e5a93a..317ef9d93acf3af174cb44da6099425fff1418eb 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -30,7 +30,8 @@ void Init(const std::vector<std::string> argv);
 void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
-                      const std::string& param_filename);
+                      const std::string& param_filename,
+                      bool model_from_memory);
 
 std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                              framework::Scope* scope,
@@ -41,6 +42,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                              const std::string& prog_filename,
                                              const std::string& param_filename);
 
+std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
+    framework::Executor* executor, framework::Scope* scope,
+    const std::string& prog_buffer, const std::string& param_buffer);
+
 // Save the variables from a scope to disk.
 void SaveVars(const framework::Scope& scope,
               const std::vector<std::string>& vars, const std::string& dirname,
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 343fd3f7c5aed6931fc215445c17d3ed7074368e..1d0d83d1f368f879878a4df8b2eefae0bc89423d 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -109,8 +109,12 @@ class Pool2dOpConverter : public OpConverter {
     }
 
     if (pool_type == "max") {
-      nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]);
-      nvinfer1::DimsHW post_pad(paddings[0], paddings[1]);
+      // Under ceil mode, the pre_pad and post_pad are used to
+      // record the the padding size. In some ceil mode cases,
+      // we do not need padding, so we initialize the two vars to 0.
+
+      nvinfer1::DimsHW pre_pad(0, 0);
+      nvinfer1::DimsHW post_pad(0, 0);
       if (ceil_mode) {
         // If ceil mode is true, we will pad the appropriate size to the input.
         DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index a3a6130db7cfe75ef558dc901883c29a20088b3f..227e2ff45873fded45899146b97a7bee0c8ad763 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -188,10 +188,16 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-TEST(Analyzer_dam, profile) {
+void profile(bool use_mkldnn = false) {
   contrib::AnalysisConfig cfg;
   SetConfig(&cfg);
 
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+    std::unordered_set<std::string> op_list = {"conv3d"};
+    cfg.SetMKLDNNOp(op_list);
+  }
+
   std::vector<PaddleTensor> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -209,6 +215,11 @@ TEST(Analyzer_dam, profile) {
   }
 }
 
+TEST(Analyzer_dam, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); }
+#endif
+
 // Check the fuse status
 TEST(Analyzer_dam, fuse_statis) {
   contrib::AnalysisConfig cfg;
@@ -222,9 +233,14 @@ TEST(Analyzer_dam, fuse_statis) {
 }
 
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_dam, compare) {
-  contrib::AnalysisConfig cfg;
+void compare(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
   SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+    std::unordered_set<std::string> op_list = {"conv3d"};
+    cfg.SetMKLDNNOp(op_list);
+  }
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -233,5 +249,10 @@ TEST(Analyzer_dam, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+TEST(Analyzer_dam, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 3a5f844de3cae7eb9b6e3555c5219c6cf8ee1919..66d85420c5701b1bf308b6850465beb6d8a0b703 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -93,9 +93,17 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-void SetConfig(contrib::AnalysisConfig *cfg) {
-  cfg->prog_file = FLAGS_infer_model + "/__model__";
-  cfg->param_file = FLAGS_infer_model + "/param";
+void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
+  if (memory_load) {
+    std::string buffer_prog, buffer_param;
+    ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog);
+    ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param);
+    cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
+                        buffer_param.size());
+  } else {
+    cfg->prog_file = FLAGS_infer_model + "/__model__";
+    cfg->param_file = FLAGS_infer_model + "/param";
+  }
   cfg->use_gpu = false;
   cfg->device = 0;
   cfg->specify_input_name = true;
@@ -114,9 +122,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-TEST(Analyzer_Chinese_ner, profile) {
+void profile(bool memory_load = false) {
   contrib::AnalysisConfig cfg;
-  SetConfig(&cfg);
+  SetConfig(&cfg, memory_load);
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) {
   }
 }
 
+TEST(Analyzer_Chinese_ner, profile) { profile(); }
+
+TEST(Analyzer_Chinese_ner, profile_memory_load) {
+  profile(true /* memory_load */);
+}
+
 // Check the fuse status
 TEST(Analyzer_Chinese_ner, fuse_statis) {
   contrib::AnalysisConfig cfg;
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
index 4231eef7220735d0b80eb1adc951c55ff7378f1b..7046bce303e2bd46197ab512ae273500b9af88bf 100644
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
   os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
   os << GenSpaces(num_spaces)
      << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
-  os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
-  os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
   os << GenSpaces(num_spaces)
      << "specify_input_name: " << config.specify_input_name << "\n";
   os << GenSpaces(num_spaces)
@@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os,
   os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
   num_spaces++;
   os << *reinterpret_cast<const NativeConfig *>(&config);
+  if (!config.model_from_memory()) {
+    os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
+    os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
+  } else {
+    os << GenSpaces(num_spaces)
+       << "prog_file and param_file: load from memory \n";
+  }
   os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
      << "\n";
   os << GenSpaces(num_spaces)
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
index 2104e4ac7222258ee025bd5acd60b1db251df654..cfb80fe6ec11a55a887c7552ec4e6a8a0c6a2fce 100644
--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
@@ -1,2 +1,7 @@
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
+cc_binary(visualizer SRCS visualizer.cc DEPS analysis
+    paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
+if(WIN32)
+  target_link_libraries(visualizer shlwapi)
+endif(WIN32)
diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..040b6476fb4febc5ca1912c8db72dc63c3bced08
--- /dev/null
+++ b/paddle/fluid/inference/utils/visualizer.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/utils/visualizer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <fstream>
+#include <memory>
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+#include "paddle/fluid/platform/init.h"
+
+DEFINE_string(model_dir, "", "model directory");
+DEFINE_string(model_program_path, "", "model program path");
+DEFINE_string(model_params_path, "", "model params path");
+
+USE_PASS(graph_viz_pass);
+USE_PASS(graph_to_program_pass);
+
+using paddle::inference::analysis::Argument;
+
+namespace paddle {
+namespace inference {
+namespace utils {
+
+void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
+
+bool Visualizer::Run() {
+  paddle::framework::InitDevices(false);
+  paddle::inference::analysis::Analyzer().Run(argument_);
+
+  return true;
+}
+
+}  // namespace utils
+}  // namespace inference
+}  // namespace paddle
+
+// Generate a dot file describing the structure of graph.
+// To use this tool, run command: ./visualizer [options...]
+// Options:
+//     --model_dir: the directory of model
+//     --model_program_path: the path of program
+//     --model_params_path: the path of params
+int main(int argc, char *argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  google::InitGoogleLogging(argv[0]);
+
+  paddle::inference::analysis::Argument argument;
+  argument.SetUseGPU(false);
+  argument.SetUseTensorRT(false);
+
+  if (FLAGS_model_dir.empty()) {
+    if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
+      LOG(ERROR) << "Please set model_dir"
+                    " or model_program_path and model_params_path";
+      return -1;
+    } else {
+      argument.SetModelProgramPath(FLAGS_model_program_path);
+      argument.SetModelParamsPath(FLAGS_model_params_path);
+    }
+  } else {
+    argument.SetModelDir(FLAGS_model_dir);
+  }
+
+  // Only 1 pass, default filename is 0_ir_origin.dot
+  // For more details, looking for paddle::inference::analysis::IRPassManager
+  argument.SetIrAnalysisPasses({"graph_viz_pass"});
+
+  std::unique_ptr<paddle::framework::Scope> scope{
+      new paddle::framework::Scope()};
+  argument.SetScopeNotOwned(
+      const_cast<paddle::framework::Scope *>(scope.get()));
+
+  paddle::inference::utils::Visualizer visualizer;
+  visualizer.SetArgument(&argument);
+  visualizer.Run();
+
+  return 0;
+}
diff --git a/paddle/fluid/inference/utils/visualizer.h b/paddle/fluid/inference/utils/visualizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..be532f92cf60e06094bfcf8cc2be85085795fcf4
--- /dev/null
+++ b/paddle/fluid/inference/utils/visualizer.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/inference/analysis/argument.h"
+
+namespace paddle {
+namespace inference {
+namespace utils {
+
+using paddle::inference::analysis::Argument;
+
+class Visualizer final {
+ public:
+  Visualizer() = default;
+  ~Visualizer() = default;
+  Visualizer(const Visualizer &) = delete;
+  Visualizer &operator=(const Visualizer &) = delete;
+
+  void SetArgument(Argument *);
+  bool Run();
+
+ private:
+  Argument *argument_;
+};
+
+}  // namespace utils
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
index 64649b1a5e471a30f435e2b1c1a9db03d35dbd8a..e16b6f78d16ce29cc493c4c795c7fe97a4bf2550 100644
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -100,8 +100,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   const T *x_data = x->data<T>();
   T *y_data = y->mutable_data<T>(ctx.GetPlace());
 
-  PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
-                 "Input dim must be with 2 or 4");
+  PADDLE_ENFORCE(
+      x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4,
+      "Input dim must be with 2, 3 or 4");
 
   std::vector<int> src_tz = framework::vectorize2int(x->dims());
 
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9258d7c7e83122149c7cbc42e4a4bdd84903ce67
--- /dev/null
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/bpr_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BprLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(rank, label_dims.size(),
+                      "Input(X) and Input(Label) shall have the same rank.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(label_dims, 0, rank - 1),
+                      "Input(X) and Input(Label) shall have the same shape "
+                      "except the last dimension.");
+
+    auto y_dims = x_dims;
+    y_dims[rank - 1] = 1;
+    ctx->SetOutputDim("Y", y_dims);
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of Seq-bpr
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class BprLossGradientOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) shoudl be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
+                      "Input(Y@Grad) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), rank,
+                      "Input(Label) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(label_dims, 0, rank - 1),
+                      "The Input(X) and Input(Label) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(dy_dims, 0, rank - 1),
+                      "The Input(X) and Input(Y@Grad) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
+                      "The last dimension of Input(Y@Grad) should be 1.");
+    PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
+                      " the last dimension of Input(Label) should be 1.");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of cross_entropy
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class BprLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a tensor whose last dimension "
+             "size is equal to the number of classes. This input is a "
+             "real number.");
+    AddInput(
+        "Label",
+        "(Tensor), the tensor which represents the ground truth. It has the "
+        "same shape with 'X' except the last dimension. the last dimension "
+        "size is 1.");
+    AddOutput("Y",
+              "(Tensor, default Tensor<float>), a tensor whose shape is same "
+              "with 'X' except that the last dimension size is 1. It "
+              "represents the sequence bpr loss.");
+    AddComment(R"DOC(
+Bayesian Personalized Ranking Loss Operator.
+
+This operator belongs to pairwise ranking loss. Label is the desired item.
+The loss at a given point in one session is defined as:
+$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$
+
+Learn more details by reading paper <session-based recommendations with recurrent
+neural networks>(https://arxiv.org/abs/1511.06939)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPUCtx = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
+REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>,
+                       ops::BprLossOpKernel<CPUCtx, double>);
+REGISTER_OP_CPU_KERNEL(bpr_loss_grad,
+                       ops::BprLossGradientOpKernel<CPUCtx, float>,
+                       ops::BprLossGradientOpKernel<CPUCtx, double>);
diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e223be7af82146e7c69c7c5aab8f08d0fe0d1710
--- /dev/null
+++ b/paddle/fluid/operators/bpr_loss_op.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+/*Todo:
+ *Find a way to adapt TolerableValue, using blas or eigen.
+ */
+template <typename T>
+struct TolerableValue {
+  HOSTDEVICE T operator()(const T& x) const {
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
+    const T kApproInf = 1e20;
+    if (x == INFINITY) return kApproInf;
+    if (x == -INFINITY) return -kApproInf;
+    return x;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class BprLossOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+    int rank = x->dims().size();
+
+    Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
+    Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1);
+    Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
+
+    const framework::Tensor* logits = &x_2d;
+    const framework::Tensor* labels = &labels_2d;
+    framework::Tensor* out = &y_2d;
+
+    const int step_size = logits->dims()[0];
+    const int class_num = logits->dims()[1];
+    const T* logits_data = logits->data<T>();
+    T* loss_data = out->data<T>();
+
+    const int64_t* label_data = labels->data<int64_t>();
+    for (int i = 0; i < step_size; ++i) {
+      int lbl_pos = label_data[i];
+      PADDLE_ENFORCE_GE(lbl_pos, 0);
+      PADDLE_ENFORCE_LT(lbl_pos, class_num);
+      int index_pos = i * class_num + lbl_pos;
+      T sum = static_cast<T>(0);
+      for (int j = 0; j < class_num; j++) {
+        if (j == lbl_pos) continue;
+        int index_neg = i * class_num + j;
+        sum += TolerableValue<T>()(-std::log(
+            1.0f + TolerableValue<T>()(std::exp(logits_data[index_neg] -
+                                                logits_data[index_pos]))));
+      }
+      loss_data[i] = -sum / (class_num - 1);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class BprLossGradientOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    const int step_size = x->dims()[0];
+    const int num_classes = x->dims()[1];
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    const T* dy_data = dy->data<T>();
+    const T* x_data = x->data<T>();
+    const int64_t* label_data = label->data<int64_t>();
+
+    for (size_t sample_id = 0; sample_id < step_size; sample_id++) {
+      for (size_t x_offset = sample_id * num_classes;
+           x_offset < (sample_id + 1) * num_classes; x_offset++) {
+        dx_data[x_offset] = static_cast<T>(0);
+      }
+      auto p_index = sample_id * num_classes + label_data[sample_id];
+      for (size_t ni = 0; ni < num_classes; ni++) {
+        if (label_data[sample_id] == ni) continue;
+        auto n_index = sample_id * num_classes + ni;
+        auto grad_ = -dy_data[sample_id] /
+                     ((num_classes - 1) *
+                      (1.0f + TolerableValue<T>()(std::exp(x_data[p_index] -
+                                                           x_data[n_index]))));
+        dx_data[p_index] += grad_;
+        dx_data[n_index] -= grad_;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index 2c09ee7394ad605f7a324d021ce0468a79bb71ca..3235ad52b999e1ca3f992034781edaab9921a300 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
 
     auto x_dims = framework::vectorize(input->dims());
     auto f_dims = framework::vectorize(filter->dims());
-    if (activation == "identity") {
-      // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
-      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
-      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-    } else if (!exhaustive_search) {
+    if (!exhaustive_search) {
       CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
           handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
           cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
 
-    // ------------------- cudnn conv+bias+act forward --------------------
-    ScalingParamType<T> alpha1 = 1.0f;
-    ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
-    auto cudnn_func = [&](void* cudnn_workspace) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-          handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-          filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-          workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-          cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+    if ((activation == "identity") &&
+        (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
+        (!residual)) {
+      // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
+      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
+      // But test in some case, the speed is slower, change to use
+      // cudnnConvolutionForward and cudnnAddTensor
+      // ------------- cudnn conv forward and bias add ---------------------
+      ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+      CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
+          handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
           output_data));
-    };
-    workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+    } else {
+      if (activation == "identity") {
+        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+      }
+      // ------------------- cudnn conv+bias+act forward --------------------
+      ScalingParamType<T> alpha1 = 1.0f;
+      ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
+            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+            output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+    }
   }
 };
 #endif
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index ce45dd58419ab20cccf00544288b79d869515578..154ff2bb209bb8f932c06caa319223ccf3314767 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -28,6 +28,46 @@ using mkldnn::stream;
 using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
 
+inline void GetWeightsTz(std::vector<int>& weights_tz, int groups,  // NOLINT
+                         bool is_conv3d) {
+  if (groups > 1) {
+    if (is_conv3d) {
+      int output = weights_tz[0];
+      int input = weights_tz[1];
+      int dimension = weights_tz[2];
+      int height = weights_tz[3];
+      int width = weights_tz[4];
+      weights_tz.resize(6);
+      weights_tz[0] = groups;
+      weights_tz[1] = output / groups;
+      weights_tz[2] = input;
+      weights_tz[3] = dimension;
+      weights_tz[4] = height;
+      weights_tz[5] = width;
+    } else {
+      int output = weights_tz[0];
+      int input = weights_tz[1];
+      int height = weights_tz[2];
+      int width = weights_tz[3];
+      weights_tz.resize(5);
+      weights_tz[0] = groups;
+      weights_tz[1] = output / groups;
+      weights_tz[2] = input;
+      weights_tz[3] = height;
+      weights_tz[4] = width;
+    }
+  }
+}
+
+inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
+                                               int groups, bool is_conv3d) {
+  if (is_conv3d) {
+    return (groups == 1) ? format : mkldnn::memory::format::goidhw;
+  } else {
+    return (groups == 1) ? format : mkldnn::memory::format::goihw;
+  }
+}
+
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -52,10 +92,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
                        filter->format() != memory::format::format_undef,
                    "Wrong layout/format set for Filter tensor");
-    PADDLE_ENFORCE(input->dims().size() == 4,
-                   "Input must be with 4 dimensions, i.e. NCHW");
-    PADDLE_ENFORCE(filter->dims().size() == 4,
-                   "Filter must be with 4 dimensions, i.e. OIHW");
+    PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
+                   "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
+                   "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
     if (bias) {
       PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
                          bias->format() != memory::format::format_undef,
@@ -71,9 +111,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
     int groups = ctx.Attr<int>("groups");
 
+    bool is_conv3d = strides.size() == 3U;
     // TODO(tpatejko): add support for dilation
     PADDLE_ENFORCE(
-        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
+        is_conv3d
+            ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
+                  dilations[2] == 1
+            : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
         "dilation in convolution is not implemented yet");
 
     const T* input_data = input->data<T>();
@@ -83,18 +127,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
     int g = std::max(groups, 1);
-    if (g > 1) {
-      int o = weights_tz[0];
-      int i = weights_tz[1];
-      int h = weights_tz[2];
-      int w = weights_tz[3];
-      weights_tz.resize(5);
-      weights_tz[0] = g;
-      weights_tz[1] = o / g;
-      weights_tz[2] = i;
-      weights_tz[3] = h;
-      weights_tz[4] = w;
-    }
+    GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
     // Get unique name for storing MKLDNN primitives
@@ -105,11 +138,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     std::vector<primitive> pipeline;
 
+    auto src_format = input->format();
+    mkldnn::memory::format weights_format =
+        GetWeightsFormat(filter->format(), g, is_conv3d);
+
     auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(),
-        (g == 1) ? filter->format() : mkldnn::memory::format::goihw);
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
@@ -119,10 +155,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
+    if (is_conv3d) {
+      chosen_memory_format =
+          platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    }
+    weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
+
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
                                // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
@@ -263,8 +305,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const mkldnn::engine& engine, const bool fuse_relu,
                        const bool fuse_residual_conn,
                        mkldnn::prop_kind fwd_prop_kind) const {
-    memory::dims stride_dims = {strides[0], strides[1]};
-    memory::dims padding_dims = {paddings[0], paddings[1]};
+    memory::dims stride_dims = strides;
+    memory::dims padding_dims = paddings;
 
     auto conv_desc = mkldnn::convolution_forward::desc(
         fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
@@ -288,8 +330,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const mkldnn::engine& engine, const bool fuse_relu,
                        const bool fuse_residual_conn,
                        mkldnn::prop_kind fwd_prop_kind) const {
-    memory::dims stride_dims = {strides[0], strides[1]};
-    memory::dims padding_dims = {paddings[0], paddings[1]};
+    memory::dims stride_dims = strides;
+    memory::dims padding_dims = paddings;
 
     auto conv_desc = mkldnn::convolution_forward::desc(
         fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
@@ -349,6 +391,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
 
+    bool is_conv3d = strides.size() == 3U;
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
     const T* output_grad_data = output_grad->data<T>();
@@ -358,8 +401,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
+    int g = std::max(groups, 1);
+    GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
+    auto src_format = input->format();
+    mkldnn::memory::format weights_format =
+        GetWeightsFormat(filter->format(), g, is_conv3d);
+
     // Get an unique name from "argument" name of "Output" variable
     // as well as attributes of primitive to be created
     // This name will be used as key when saving info into device context
@@ -372,9 +421,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     // Create user memory descriptors
     auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
     auto user_diff_dst_md = platform::MKLDNNMemDesc(
         {dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
 
@@ -386,14 +435,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
+    if (is_conv3d) {
+      chosen_memory_format =
+          platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    }
+    weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
+
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto diff_src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     auto diff_weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     auto diff_dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
@@ -500,3 +555,13 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
                                     ::paddle::platform::CPUPlace, FP32,
                                     ops::kConvMKLDNNFP32,
                                     ops::ConvMKLDNNGradOpKernel<float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNOpKernel<float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 7455b9492f054b32ee7fb1fc90b1a344367ceb81..d7b876628855b8b76b340cd1e6115896ead4aa6c 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -134,14 +134,14 @@ void Conv2DOpMaker::Make() {
            "The format of output tensor is X (one-dimensional) of size equal"
            "to the number of output channels. Only used with MKL-DNN.")
       .AsDispensable();
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution operator. "
-            "The format of output tensor is also NCHW.");
   AddInput("ResidualData",
            "(Tensor) Tensor with residual data "
            "to which convolution output will be added."
            "Used with fuse_residual_connection fusion.")
       .AsDispensable();
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator. "
+            "The format of output tensor is also NCHW.");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int> default:{1, 1}), the "
                             "strides(h_stride, w_stride) of "
@@ -232,6 +232,10 @@ $$
 }
 
 void Conv3DOpMaker::Make() {
+  AddAttr<bool>("is_test",
+                "(bool, default false) Set to true for inference only, false "
+                "for training. Some layers may run faster when this is true.")
+      .SetDefault(false);
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution operator. "
@@ -247,6 +251,11 @@ void Conv3DOpMaker::Make() {
            "is the width of the filter."
            "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
+  AddInput("ResidualData",
+           "(Tensor) Tensor with residual data "
+           "to which convolution output will be added."
+           "Used with fuse_residual_connection fusion.")
+      .AsDispensable();
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
             "The format of output tensor is also NCDHW.");
@@ -280,6 +289,13 @@ void Conv3DOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddAttr<bool>("fuse_residual_connection",
+                "(bool, default false) Only used in mkldnn kernel. Used "
+                "whenever convolution output is as an input to residual "
+                "connection.")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 36979de68f3abfdedfcc4a49cc312c1f849f5676..101dbe9c89616b7025337261469e2b1aa3e8bc76 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -13,16 +13,26 @@ set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor
 
 if(WITH_GRPC)
   grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
+        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc
       PROTO send_recv.proto 
-      DEPS lod_tensor selected_rows memory)
+      DEPS lod_tensor selected_rows_functor memory)
 
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
   cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
     DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+
   cc_test(rpc_server_test SRCS rpc_server_test.cc
     DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
+
   cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
+
+  if(WITH_GPU)
+  cc_test(collective_server_test SRCS collective_server_test.cc 
+      DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+      selected_rows_functor  scope math_function SERIAL)
+  endif()
+
   cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
 else()
   set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d3f53431113621fc859eda8e7448383772d20a3
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_client.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <condition_variable>  // NOLINT
+#include <string>
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/operators/distributed/collective_client.h"
+
+DECLARE_int32(rpc_deadline);
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+std::once_flag CollectiveClient::init_flag_;
+std::unique_ptr<CollectiveClient> CollectiveClient::client_(nullptr);
+
+bool CollectiveClient::Gather(const std::vector<RemoteVar>& remote_vars,
+                              std::vector<const framework::SelectedRows*>* dst,
+                              const platform::DeviceContext& ctx,
+                              framework::Scope* scope, int64_t time_out) {
+  for (auto r : remote_vars) {
+    VLOG(50) << "begin gather from ep:" << r.String();
+    scope->Var(r.var_name_)->GetMutable<framework::SelectedRows>();
+    VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable(
+        r.ep_, ctx, *scope, r.var_name_, time_out);
+  }
+
+  rpc_client_->Wait();
+
+  for (auto r : remote_vars) {
+    auto select_rows =
+        scope->FindVar(r.var_name_)->GetMutable<framework::SelectedRows>();
+    dst->push_back(select_rows);
+
+    VLOG(4) << "gather from ep:" << r.String()
+            << ", select_rows:" << GetSelectedRowsInfo(*select_rows);
+
+    rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_);
+  }
+
+  rpc_client_->Wait();
+  return true;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..53b03c531a2b8859e6d7c904e9ab4d1b7a5c8b9b
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_client.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <condition_variable>  // NOLINT
+#include <string>
+#include <vector>
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+
+DECLARE_int32(rpc_deadline);
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) {
+  std::stringstream ss;
+  ss << ", height:" << slr.height() << ", rows:[";
+  for (unsigned int i = 0; i < slr.rows().size(); i++) {
+    if (i != slr.rows().size() - 1) {
+      ss << slr.rows()[i] << ",";
+    } else {
+      ss << slr.rows()[i];
+    }
+  }
+  ss << "], dims:" << slr.value().dims();
+  return ss.str();
+}
+
+struct RemoteVar {
+  std::string ep_;
+  std::string var_name_;
+  int trainer_id_{0};
+
+  std::string String() {
+    std::stringstream ss;
+    ss << "ep:" << ep_ << ", var_name:" << var_name_
+       << ", trainer_id:" << trainer_id_;
+
+    return ss.str();
+  }
+};
+
+class CollectiveClient {
+ public:
+  CollectiveClient() {
+    rpc_client_.reset(new RPCCLIENT_T());
+    rpc_client_->InitImpl();
+  }
+  virtual ~CollectiveClient() {}
+
+  // note this function will retain the rank order.
+  bool Gather(const std::vector<RemoteVar>& remote_vars,
+              std::vector<const framework::SelectedRows*>* dst,
+              const platform::DeviceContext& ctx, framework::Scope* scope,
+              int64_t time_out = FLAGS_rpc_deadline);
+
+  static CollectiveClient* GetInstance() {
+    std::call_once(init_flag_, [&]() {
+      if (client_.get() == nullptr) {
+        client_.reset(new CollectiveClient());
+      }
+    });
+    return client_.get();
+  }
+
+ private:
+  std::unique_ptr<RPCClient> rpc_client_;
+
+  static std::once_flag init_flag_;
+  static std::unique_ptr<CollectiveClient> client_;
+};
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c95652400c27acd406ca3f70a0dfa8d329e94358
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_server.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>  // for removing the port file
+#include <csignal>
+#include <cstdlib>
+#include <fstream>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "paddle/fluid/operators/distributed/collective_server.h"
+
+DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get");
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+std::once_flag CollectiveServer::init_flag_;
+std::shared_ptr<CollectiveServer> CollectiveServer::collective_server_(nullptr);
+
+CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) {
+  VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in;
+  rpc_server_.reset(new RPCSERVER_T(end_point, fan_in));
+}
+
+void CollectiveServer::Stop() {
+  rpc_server_->ShutDown();
+  server_thread_->join();
+  loop_thread_->join();
+}
+
+void CollectiveServer::StartServer() {
+  get_monomer_handler_.reset(new GetMonomerHandler());
+  get_monomer_handler_->SetRPCServer(rpc_server_.get());
+
+  get_barrier_handler_.reset(new GetMonomerBarrierHandler());
+  get_barrier_handler_->SetRPCServer(rpc_server_.get());
+
+  rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable,
+                           get_monomer_handler_.get(),
+                           FLAGS_collective_get_thread_num);
+  rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier,
+                           get_barrier_handler_.get(), 1);
+
+  server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); }));
+  rpc_server_->WaitServerReady();
+
+  loop_thread_.reset(new std::thread([&]() {
+    while (true) {
+      if (rpc_server_->IsExit()) {
+        LOG(WARNING) << "get exit!rpc_processor break!";
+        break;
+      }
+      sleep(1);
+    }
+    VLOG(1) << "CollectiveServer loop_thread end";
+  }));
+}
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..a23dc18b4de86421a0995b9951e0ae6f4dc76150
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_server.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class CollectiveServer;
+
+class GetMonomerHandler final : public RequestHandler {
+ public:
+  GetMonomerHandler() : RequestHandler(true) {}
+  virtual ~GetMonomerHandler() {}
+  bool Handle(const std::string& var_name, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override {
+    VLOG(50) << "GetMonomerHandler recv " << var_name;
+
+    *outvar = scope->FindVar(var_name);
+    PADDLE_ENFORCE(outvar != nullptr, "%s not found", var_name);
+
+    return true;
+  }
+};
+
+class GetMonomerBarrierHandler final : public RequestHandler {
+ public:
+  GetMonomerBarrierHandler() : RequestHandler(true) {}
+  virtual ~GetMonomerBarrierHandler() {}
+  bool Handle(const std::string& var_name, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override {
+    VLOG(50) << "GetMonomerHandler recv " << var_name;
+
+    rpc_server_->IncreaseVarBarrier(var_name);
+
+    return true;
+  }
+};
+
+class CollectiveServer final {
+ public:
+  explicit CollectiveServer(const std::string& end_point, int fan_in);
+
+  virtual ~CollectiveServer() {}
+
+  void StartServer();
+
+  static CollectiveServer* GetInstance(const std::string& end_point,
+                                       int fan_in) {
+    std::call_once(init_flag_, [&]() {
+      if (collective_server_.get() == nullptr) {
+        collective_server_.reset(new CollectiveServer(end_point, fan_in));
+        collective_server_->StartServer();
+      }
+    });
+
+    return collective_server_.get();
+  }
+
+  std::shared_ptr<RPCServer> GetRPCServer() { return rpc_server_; }
+
+  void Stop();
+
+ private:
+  std::unique_ptr<GetMonomerHandler> get_monomer_handler_;
+  std::unique_ptr<GetMonomerBarrierHandler> get_barrier_handler_;
+
+  std::shared_ptr<distributed::RPCServer> rpc_server_;
+  std::shared_ptr<std::thread> server_thread_;
+  std::shared_ptr<std::thread> loop_thread_;
+
+  bool ready_{false};
+
+  static std::once_flag init_flag_;
+  static std::shared_ptr<CollectiveServer> collective_server_;
+};
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a9c69e393257068371e88253b82a500f58ed837
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/collective_client.h"
+#include "paddle/fluid/operators/distributed/collective_server.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace distributed = paddle::operators::distributed;
+
+std::unique_ptr<distributed::CollectiveServer> StartServer(
+    const std::string& ep, int fan_in, framework::Scope* scope,
+    platform::DeviceContext* dev_ctx) {
+  distributed::CollectiveServer* server =
+      distributed::CollectiveServer::GetInstance(ep, fan_in);
+
+  auto rpc_server = server->GetRPCServer();
+  rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable,
+                          scope, dev_ctx);
+
+  std::cout << "StartServer return" << std::endl;
+  return std::unique_ptr<distributed::CollectiveServer>(server);
+}
+
+std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+
+  framework::Scope* scope = new framework::Scope();
+  framework::Variable* var = scope->Var("var1");
+  auto* slr = var->GetMutable<framework::SelectedRows>();
+  slr->set_height(1000);
+
+  auto* tensor = slr->mutable_value();
+  auto* rows = slr->mutable_rows();
+
+  tensor->Resize(framework::make_ddim({3, 5}));
+  tensor->mutable_data<float>(place);
+
+  paddle::operators::math::set_constant(ctx, tensor, 32.7);
+  for (int i = 0; i < 3; ++i) rows->push_back(i);
+
+  std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr);
+
+  return std::unique_ptr<framework::Scope>(scope);
+}
+
+void Gather(const std::vector<distributed::RemoteVar>& vars,
+            platform::DeviceContext* dev_ctx) {
+  distributed::CollectiveClient* client =
+      distributed::CollectiveClient::GetInstance();
+
+  framework::Scope* scope = new framework::Scope();
+  framework::Variable* var = scope->Var("var1");
+  var->GetMutable<framework::SelectedRows>();
+
+  std::vector<const framework::SelectedRows*> dst;
+  client->Gather(vars, &dst, *dev_ctx, scope);
+  std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]);
+}
+
+TEST(PREFETCH, GPU) {
+  platform::CUDAPlace place;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+
+  std::string ep = "127.0.0.1:7164";
+  auto scope = GenerateVars(place);
+
+  auto* v1 = scope->FindVar("var1");
+  std::cout << "var1:" << v1 << std::endl;
+
+  auto server = StartServer(ep, 2, scope.get(), &ctx);
+  auto rpc_server = server->GetRPCServer();
+
+  distributed::RemoteVar var;
+  var.ep_ = ep;
+  var.var_name_ = "var1";
+  var.trainer_id_ = 0;
+
+  std::vector<distributed::RemoteVar> vars{var};
+  Gather(vars, &ctx);
+  Gather(vars, &ctx);
+
+  std::cout << "begin WaitVarBarrier" << std::endl;
+  rpc_server->WaitVarBarrier("var1");
+  rpc_server->ClearRegisteredVars();
+  server->Stop();
+
+  scope.release();
+  server.release();
+}
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index d7f3ea86aff9e7df3cd9ff3dca573a1ec6ccc27a..857214aa211aee0251571e46049c66c084b470f1 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -28,11 +28,11 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 
-void GRPCClient::InitImpl() { InitEventLoop(); }
-
-void GRPCClient::InitEventLoop() {
+void GRPCClient::InitImpl() {
   // start the client process thread
   // TODO(wuyi): can make this in a threadpool
+  PADDLE_ENFORCE(client_thread_ == nullptr,
+                 "please not re init proceed thread");
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
 
@@ -106,6 +106,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
 
 void ProcGetResponse(const VarHandle& var_h,
                      const ::grpc::ByteBuffer& ret_msg) {
+  VLOG(100) << "ProcGetResponse";
   framework::Variable* outvar = nullptr;
   // get response's trainer_id is not used
   int trainer_id;
@@ -126,6 +127,24 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                      const framework::Scope& scope,
                                      const std::string& var_name,
                                      int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name,
+                      "/sendrecv.SendRecvService/GetVariable", time_out);
+}
+
+VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& var_name,
+    int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name,
+                      "/sendrecv.SendRecvService/GetMonomerVariable", time_out);
+}
+
+VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& var_name,
+                                      const std::string& rpc_path,
+                                      int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -136,7 +155,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
   VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] {
+  framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -151,8 +170,8 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
 
     platform::RecordRPCEvent record_event(method, p_ctx);
 
-    auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
+    auto call =
+        s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
 
@@ -268,6 +287,34 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   return h;
 }
 
+VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
+                                                const std::string& var_name,
+                                                int64_t time_out) {
+  const auto ch = GetChannel(ep);
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  const std::string method = "SendMonomerFetchBarrierRPC";
+  VarHandlePtr h(
+      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
+  s->Prepare(h, time_out);
+
+  VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
+
+  sendrecv::VariableMessage req;
+  req.set_varname(var_name);
+
+  platform::RecordRPCEvent record_event(method, nullptr);
+
+  auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
+  return h;
+}
+
 VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
                                            int64_t time_out) {
   const auto ch = GetChannel(ep);
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index a31a465645ee4256a76573576ea7fa5af7a5a101..01bf46cc313b4707c7af7a9605926a8b298d679d 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -189,6 +189,11 @@ class GRPCClient : public RPCClient {
                            const std::string& var_name,
                            int64_t time_out = FLAGS_rpc_deadline) override;
 
+  VarHandlePtr AsyncGetMonomerVariable(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) override;
+
   VarHandlePtr AsyncPrefetchVar(const std::string& ep,
                                 const platform::DeviceContext& ctx,
                                 const framework::Scope& scope,
@@ -200,8 +205,12 @@ class GRPCClient : public RPCClient {
   VarHandlePtr AsyncSendBatchBarrier(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
 
-  VarHandlePtr AsyncSendFetchBarrier(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncSendFetchBarrier(const std::string& ep,
+                                     int64_t time_out) override;
+
+  VarHandlePtr AsyncGetMonomerBarrier(
+      const std::string& ep, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) override;
 
   VarHandlePtr AsyncCheckpointNotify(
       const std::string& ep, const std::string& dir,
@@ -214,21 +223,22 @@ class GRPCClient : public RPCClient {
 
   void SendComplete() override;
 
- protected:
   void InitImpl() override;
 
  private:
-  // InitEventLoop should only be called by Init()
-  void InitEventLoop();
-
   void Proceed();
 
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
+  VarHandlePtr _AsyncGetVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name, const std::string& rpc,
+                            int64_t time_out);
 
  private:
   grpc::CompletionQueue cq_;
   std::unordered_map<std::string, std::shared_ptr<grpc::Channel>> channels_;
-  std::unique_ptr<std::thread> client_thread_;
+  std::unique_ptr<std::thread> client_thread_{nullptr};
 
   // mutex for Wait client sync
   std::mutex sync_mutex_;
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index d9200c98b23601f8ffaa8eb7a7092a9cf881ca24..c3974138f4d4665c46bdfccaef09c0bd84b9d028 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -158,6 +158,98 @@ class RequestGet final : public RequestBase {
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
 };
 
+class RequestGetMonomerVariable final : public RequestBase {
+ public:
+  explicit RequestGetMonomerVariable(GrpcService::AsyncService* service,
+                                     ::grpc::ServerCompletionQueue* cq,
+                                     RequestHandler* request_handler,
+                                     int req_id, RPCServer* rpc_server)
+      : RequestBase(service, cq, request_handler, req_id),
+        responder_(&ctx_),
+        rpc_server_(rpc_server) {
+    auto method_id =
+        static_cast<int>(distributed::GrpcMethod::kGetMonomerVariable);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGetMonomerVariable() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
+    std::string varname = request_.varname();
+
+    rpc_server_->WaitVarCond(varname);
+    MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    auto scope = h.scope_;
+    auto invar = scope->FindVar(varname);
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar,
+                             request_.trainer_id());
+
+    if (outvar) {
+      SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_);
+    }
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ::grpc::ByteBuffer reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+  RPCServer* rpc_server_{nullptr};
+};
+
+class RequestGetMonomerBarrier final : public RequestBase {
+ public:
+  explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service,
+                                    ::grpc::ServerCompletionQueue* cq,
+                                    RequestHandler* request_handler, int req_id,
+                                    RPCServer* rpc_server)
+      : RequestBase(service, cq, request_handler, req_id),
+        responder_(&ctx_),
+        rpc_server_(rpc_server) {
+    auto method_id =
+        static_cast<int>(distributed::GrpcMethod::kGetMonomerBarrier);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGetMonomerBarrier() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
+    std::string varname = request_.varname();
+    VLOG(4) << "RequestGetMonomerBarrier " << varname;
+
+    rpc_server_->WaitVarCond(varname);
+    MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    framework::Scope* scope = nullptr;
+    framework::Variable* invar = nullptr;
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar,
+                             request_.trainer_id());
+
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  sendrecv::VoidMessage reply_;
+  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
+  RPCServer* rpc_server_{nullptr};
+};
+
 class RequestPrefetch final : public RequestBase {
  public:
   explicit RequestPrefetch(GrpcService::AsyncService* service,
@@ -249,7 +341,7 @@ class RequestCheckpointNotify final : public RequestBase {
 };
 
 void AsyncGRPCServer::WaitServerReady() {
-  VLOG(4) << "AsyncGRPCServer is wait server ready";
+  VLOG(4) << "AsyncGRPCServer is waiting server ready";
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
   condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
   VLOG(4) << "AsyncGRPCServer WaitSeverReady";
@@ -368,6 +460,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
     b = new RequestSend(&service_, cq.get(), handler, req_id);
   } else if (rpc_name == kRequestGet) {
     b = new RequestGet(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestGetMonomerVariable) {
+    b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id,
+                                      this);
+  } else if (rpc_name == kRequestGetMonomerBarrier) {
+    b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id,
+                                     this);
   } else if (rpc_name == kRequestPrefetch) {
     b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
   } else if (rpc_name == kRequestCheckpoint) {
@@ -378,7 +476,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
 
   reqs[req_id] = b;
 
-  VLOG(4) << "Create RequestSend status:" << b->Status();
+  VLOG(4) << "TryToRegisterNewOne status:" << b->Status();
 }
 
 void AsyncGRPCServer::HandleRequest(
diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h
index 9ae9a31a003cbb1f808fd1127a5dd78511aa3e99..537429b5fe989269d437b6dfe558c0a7dcfc2dcc 100644
--- a/paddle/fluid/operators/distributed/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc_service.h
@@ -81,10 +81,12 @@ enum class GrpcMethod {
   kGetVariable,
   kPrefetchVariable,
   kCheckpointNotify,
+  kGetMonomerVariable,
+  kGetMonomerBarrier,
 };
 
 static const int kGrpcNumMethods =
-    static_cast<int>(GrpcMethod::kCheckpointNotify) + 1;
+    static_cast<int>(GrpcMethod::kGetMonomerBarrier) + 1;
 
 inline const char* GrpcMethodName(GrpcMethod id) {
   switch (id) {
@@ -92,6 +94,10 @@ inline const char* GrpcMethodName(GrpcMethod id) {
       return "/sendrecv.SendRecvService/SendVariable";
     case GrpcMethod::kGetVariable:
       return "/sendrecv.SendRecvService/GetVariable";
+    case GrpcMethod::kGetMonomerVariable:
+      return "/sendrecv.SendRecvService/GetMonomerVariable";
+    case GrpcMethod::kGetMonomerBarrier:
+      return "/sendrecv.SendRecvService/GetMonomerBarrier";
     case GrpcMethod::kPrefetchVariable:
       return "/sendrecv.SendRecvService/PrefetchVariable";
     case GrpcMethod::kCheckpointNotify:
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 5272afd42851160ca5352ef474d940a5d2dd2456..62b24f150b41efead24c8bdbe08c9b44e160445a 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -37,6 +37,8 @@ namespace distributed {
 
 constexpr char kRequestSend[] = "RequestSend";
 constexpr char kRequestGet[] = "RequestGet";
+constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable";
+constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier";
 constexpr char kRequestPrefetch[] = "RequestPrefetch";
 constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
 constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 4cd3abb5a61068bc4f9f5b38cafc2daa8406d448..b668d869787a47ebd36f570061421ddbeae5a09a 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -45,6 +45,11 @@ class RPCClient {
                                    const std::string& var_name,
                                    int64_t time_out = FLAGS_rpc_deadline) = 0;
 
+  virtual VarHandlePtr AsyncGetMonomerVariable(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncPrefetchVar(
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& in_var_name,
@@ -57,6 +62,10 @@ class RPCClient {
   virtual VarHandlePtr AsyncSendFetchBarrier(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
 
+  virtual VarHandlePtr AsyncGetMonomerBarrier(
+      const std::string& ep, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncCheckpointNotify(
       const std::string& ep, const std::string& dir,
       int64_t time_out = FLAGS_rpc_deadline) = 0;
@@ -87,8 +96,9 @@ class RPCClient {
     }
   }
 
- protected:
   virtual void InitImpl() {}
+
+ protected:
   // each trainer have exact one trainer id, it should be static
   static int trainer_id_;
 
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 3e30ed4ac86bd2cb3f7c4301163e54a947c3d5b4..122619d41b25da488742b4a7192b6a18b8bf9283 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -132,6 +132,96 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
       lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
 }
 
+void RPCServer::RegisterVar(const std::string& var_name,
+                            const std::string& rpc_name,
+                            framework::Scope* scope,
+                            platform::DeviceContext* dev_ctx) {
+  MonomerHandle h;
+  h.var_name_ = var_name;
+  h.rpc_name_ = rpc_name;
+  h.scope_ = scope;
+  h.dev_ctx_ = dev_ctx;
+
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (var_map_.find(var_name) != var_map_.end()) {
+      PADDLE_ENFORCE(false, "%s alreay in var_map", var_name);
+    }
+    var_map_[var_name] = h;
+  }
+
+  rpc_cond_.notify_all();
+  VLOG(4) << "RegisterVar context:" << h.String();
+}
+
+void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
+  int b = 0;
+  MonomerHandle h;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    b = ++var_map_[var_name].barrier_;
+    h = var_map_[var_name];
+  }
+
+  if (b >= client_num_) {
+    barrier_cond_.notify_all();
+  }
+
+  VLOG(4) << "IncreaseVarBarrier context:" << h.String();
+}
+
+void RPCServer::WaitVarBarrier(const std::string& var_name) {
+  VLOG(4) << "WaitBarrier var_name:" << var_name;
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  barrier_cond_.wait(lock, [&]() {
+    return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) ||
+            exit_flag_.load());
+  });
+
+  VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String();
+}
+
+void RPCServer::SetVarCond(const std::string& var_name) {
+  VLOG(4) << "SetVarCond var_name:" << var_name;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (var_map_.find(var_name) != var_map_.end()) {
+      rpc_cond_.notify_all();
+    }
+  }
+}
+
+void RPCServer::WaitVarCond(const std::string& var_name) {
+  VLOG(4) << "WaitVarCond var_name:" << var_name;
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  rpc_cond_.wait(lock, [=] {
+    return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
+  });
+
+  VLOG(4) << "WaitVarCond var_name:" << var_name << " end";
+}
+
+MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
+  MonomerHandle h;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    h = var_map_[var_name];
+  }
+
+  return h;
+}
+
+void RPCServer::ClearRegisteredVars() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  var_map_.clear();
+}
+
+void RPCServer::ClearVar(const std::string& var_name) {
+  std::unique_lock<std::mutex> lock(mutex_);
+  var_map_.erase(var_name);
+}
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index c78c5007a7f262f15305b6c284e8c4fbddef42a0..45d1d3479ce731894c26bbff40f456bbfdc13d44 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -21,12 +21,30 @@
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+struct MonomerHandle {
+  std::string var_name_;
+  std::string rpc_name_;
+  framework::Scope* scope_{nullptr};
+  platform::DeviceContext* dev_ctx_{nullptr};
+  int64_t barrier_{0};
+
+  std::string String() {
+    std::stringstream ss;
+    ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_
+       << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_
+       << ", barrier_:" << barrier_;
+    return ss.str();
+  }
+};
+
 class RPCServer {
  public:
   explicit RPCServer(const std::string& address, int client_num)
@@ -67,6 +85,16 @@ class RPCServer {
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);
 
+  void RegisterVar(const std::string& var_name, const std::string& rpc_name,
+                   framework::Scope* scope, platform::DeviceContext* dev_ctx);
+  void IncreaseVarBarrier(const std::string& var_name);
+  void WaitVarBarrier(const std::string& var_name);
+  void SetVarCond(const std::string& var_name);
+  void WaitVarCond(const std::string& var_name);
+  void ClearRegisteredVars();
+  void ClearVar(const std::string& var_name);
+  MonomerHandle GetMonomer(const std::string& var_name);
+
   void Complete();
 
   void ResetBarrierCounter();
@@ -95,6 +123,9 @@ class RPCServer {
   std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
   std::unordered_map<std::string, int> rpc_thread_num_;
   friend class RequestHandler;
+
+  // TODO(gongwb): use more cond to notify or wait;
+  std::unordered_map<std::string, MonomerHandle> var_map_;
 };
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index 7b7d069f17fd0f9e6a776fa4d1a19cf01914cfeb..2637619f304d246fa535bbfc7be3474209b63b0f 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -28,6 +28,9 @@ service SendRecvService {
   rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 
   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
+
+  rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
+  rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index 0522a94195786c767194ec727d982a60451e7c62..9d1423915afc25889b9fa96963d6f9514bea2870 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
-
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin),
-                   "Cannot open file %s for load_combine op", filename);
-
+    auto model_from_memory = Attr<bool>("model_from_memory");
     auto out_var_names = Outputs("Out");
     PADDLE_ENFORCE_GT(
         static_cast<int>(out_var_names.size()), 0,
         "The number of output variables should be greater than 0.");
-
+    if (!model_from_memory) {
+      std::ifstream fin(filename);
+      PADDLE_ENFORCE(static_cast<bool>(fin),
+                     "Cannot open file %s for load_combine op", filename);
+      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
+    } else {
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream fin(filename);
+      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
+    }
+  }
+  void LoadParamsFromBuffer(
+      const framework::Scope &scope, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
 
@@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase {
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
 
       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
-                     filename);
+      PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");
 
       // Get data from fin to tensor
-      DeserializeFromStream(fin, tensor, dev_ctx);
+      DeserializeFromStream(*buffer, tensor, dev_ctx);
 
       auto in_dtype = framework::ToDataType(tensor->type());
       auto out_dtype =
@@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                          "LoDTensors will be loaded from \"file_path\".")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddAttr<bool>("model_from_memory",
+                  "(boolean, default false)"
+                  "If true, file_path is in memory, and LoDTensors will be "
+                  "loaded directly from memory")
+        .SetDefault(false);
     AddComment(R"DOC(
 LoadCombine Operator.
 
-LoadCombine operator loads LoDTensor variables from a file. The file should 
-contain one or more LoDTensors serialized using the SaveCombine operator. The 
+LoadCombine operator loads LoDTensor variables from a file, which could be 
+loaded in memory already. The file should contain one or more LoDTensors 
+serialized using the SaveCombine operator. The
 LoadCombine operator applies a deserialization strategy to appropriately load 
 the LodTensors, and this strategy complements the serialization strategy used 
 in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 31ed5196668954bc387423c34a0667622db71373..9e99e44822b2fce971b751967ca8076a1f1384ec 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 
diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h
index af64607fafc6544047714e731846a2440be219b8..1fef2b3d378c96d087118d0136885e7e29aa237c 100644
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@@ -72,10 +72,11 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
       auto rows_idx = outs_rows_idx[i];
       outs[i]->set_height(height_sections[i]);
+      auto dims = x->GetCompleteDims();
+      dims[0] = rows_idx.size();
+      outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
+      outs[i]->mutable_rows()->clear();
       if (rows_idx.size() > 0) {
-        auto dims = x->GetCompleteDims();
-        dims[0] = rows_idx.size();
-        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
         for (auto idx : rows_idx) {
           outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
         }
@@ -98,6 +99,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
           }
         }
       }
+      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
+                        "rows should has the same size with tensor dim 0");
     }
   }
 };
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index d0a108f905f46135bcd2b68be19ab396ab897272..bd81d4dd1f1073edffcb9fd4a02b455db27361d5 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -120,15 +120,24 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
 
   void* allocate(size_t num_bytes) const override {
+    if (UNLIKELY(num_bytes == 0)) {
+      return nullptr;
+    }
     auto buf = paddle::memory::Alloc(place_, num_bytes,
                                      memory::Allocator::kScratchpad);
     void* retv = buf->ptr();
-    allocations_[buf->ptr()] = std::move(buf);
+    {
+      std::lock_guard<std::mutex> lock(mtx_);
+      allocations_.emplace(retv, std::move(buf));
+    }
     return retv;
   }
 
   void deallocate(void* buffer) const override {
-    allocations_.erase(allocations_.find(buffer));
+    if (LIKELY(buffer)) {
+      std::lock_guard<std::mutex> lock(mtx_);
+      allocations_.erase(buffer);
+    }
   }
 
   void* scratchpad() const override {
@@ -155,6 +164,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   const cudaDeviceProp* device_prop_;  // not owned;
   mutable void* scratch_;
   mutable unsigned int* semaphore_;
+  mutable std::mutex mtx_;  // to protect allocations_
   mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
 };
 
@@ -210,6 +220,40 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
                           << (cudnn_dso_ver % 100) / 10 << ".";
+
+  {
+    // Check CUDA/CUDNN version compatiblity
+    auto local_cuda_version = runtime_version_ / 100;
+    auto compile_cuda_version = CUDA_VERSION / 100;
+    if (local_cuda_version < compile_cuda_version) {
+      LOG_FIRST_N(WARNING, 1)
+          << "WARNING: device: " << place_.device
+          << ". The installed Paddle is compiled with CUDA "
+          << compile_cuda_version / 10 << "." << compile_cuda_version % 10
+          << ", but CUDA runtime version in your machine is "
+          << local_cuda_version / 10 << "." << local_cuda_version % 10
+          << ", which may cause serious incompatible bug. "
+          << "Please recompile or reinstall Paddle with compatible CUDA "
+             "version.";
+    }
+
+    if (dynload::HasCUDNN()) {
+      auto local_cudnn_version = cudnn_dso_ver / 100;
+      auto compile_cudnn_version = CUDNN_VERSION / 100;
+      if (local_cuda_version < compile_cuda_version) {
+        LOG_FIRST_N(WARNING, 1)
+            << "WARNING: device: " << place_.device
+            << ". The installed Paddle is compiled with CUDNN "
+            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
+            << ", but CUDNN version in your machine is "
+            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
+            << ", which may cause serious incompatible bug. "
+            << "Please recompile or reinstall Paddle with compatible CUDNN "
+               "version.";
+      }
+    }
+  }
+
   callback_manager_.reset(new StreamCallbackManager(stream_));
 }
 
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 167bd4e81d0ddbbba260417b460d083dbeb932b6..e53064893ee89f663a76483b92de32b318b6c61f 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -113,6 +113,18 @@ inline mkldnn::memory::format MKLDNNFormatForSize(
     return mkldnn::memory::format::x;
   } else if (dims_size == 2) {
     return mkldnn::memory::format::nc;
+  } else if (dims_size == 3) {
+    if (data_format == mkldnn::memory::format::nchw) {
+      return mkldnn::memory::format::ncw;
+    } else if (data_format == mkldnn::memory::format::nhwc) {
+      return mkldnn::memory::format::nwc;
+    }
+  } else if (dims_size == 5) {
+    if (data_format == mkldnn::memory::format::nchw) {
+      return mkldnn::memory::format::ncdhw;
+    } else if (data_format == mkldnn::memory::format::nhwc) {
+      return mkldnn::memory::format::ndhwc;
+    }
   }
   return data_format;
 }
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index d602613fc82223e14f48830a87533880696eb550..b8954cb12628d1f4f333956e0213ddf9c01e592c 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,6 +1,7 @@
 
-set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler)
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc)
+set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
+
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34e9c897d9e95feb185083b7c0a6a824d8dc809c
--- /dev/null
+++ b/paddle/fluid/pybind/imperative.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/pybind/imperative.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/tracer.h"
+
+namespace paddle {
+namespace pybind {
+
+// Bind Methods
+void BindTracer(pybind11::module *m) {
+  pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
+      .def("__init__",
+           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
+             new (&self) imperative::Tracer(root_block);
+           })
+      .def("trace", &imperative::Tracer::Trace)
+      .def("get_scope", &imperative::Tracer::GetScope,
+           pybind11::return_value_policy::reference);
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a9d3a01ea81f11ac85000c3d0153f20e108789a
--- /dev/null
+++ b/paddle/fluid/pybind/imperative.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <Python.h>
+#include <vector>
+#include "paddle/fluid/imperative/layer.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace paddle {
+namespace pybind {
+
+class PyLayer : public imperative::Layer {
+ public:
+  using imperative::Layer::Layer;  // Inherit constructors
+
+  std::vector<imperative::VarBase> Forward(
+      const std::vector<imperative::VarBase>& inputs) override {
+    PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
+                      inputs);  // NOLINT
+  }
+
+  void Backward() override {
+    PYBIND11_OVERLOAD(void, Layer, Backward, );  // NOLINT
+  }
+};
+
+class PyOpBase : public imperative::OpBase {
+ public:
+  using imperative::OpBase::OpBase;  // Inherit constructors
+};
+
+class PyVarBase : public imperative::VarBase {
+ public:
+  using imperative::VarBase::VarBase;  // Inherit constructors
+};
+
+void BindTracer(pybind11::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index fc7991d2974c9262e6225de1537025944c1068c1..74b4f2e937b3d3715b13b03e8d3618c0afafb69c 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -45,6 +46,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/async_executor_py.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
+#include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
@@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) {
 
   BindException(&m);
 
+  py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
+      .def(py::init<>())
+      .def("_run_backward",
+           [](imperative::VarBase &self, framework::Scope *scope) {
+             self.RunBackward(scope);
+           })
+      .def("_grad", &imperative::VarBase::Grad)
+      .def_property(
+          "desc",
+          [](const imperative::VarBase &self) { return self.var_desc_; },
+          [](imperative::VarBase &self, framework::VarDesc *var_desc) {
+            self.var_desc_ = var_desc;
+          },
+          py::return_value_policy::reference);
+
+  py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
+      .def(py::init<>())
+      .def_property(
+          "desc", [](const imperative::OpBase &self) { return self.op_desc_; },
+          [](imperative::OpBase &self, framework::OpDesc *op_desc) {
+            if (op_desc) {
+              self.op_desc_ = op_desc;
+            }
+          },
+          py::return_value_policy::reference);
+
+  py::class_<imperative::Layer, PyLayer /* <--- trampoline*/> layer(m, "Layer");
+  layer.def(py::init<>())
+      .def("forward",
+           [](imperative::Layer &self,
+              const std::vector<imperative::VarBase> &inputs) {
+             return self.Forward(inputs);
+           })
+      .def("backward", &imperative::Layer::Backward);
+  BindTracer(&m);
+
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
@@ -298,6 +336,8 @@ PYBIND11_MODULE(core, m) {
       .def("get_tensor",
            [](SelectedRows &self) { return self.mutable_value(); },
            py::return_value_policy::reference)
+      .def("numel",
+           [](SelectedRows &self) -> int64_t { return self.value().numel(); })
       .def("set_height", &SelectedRows::set_height)
       .def("height", &SelectedRows::height)
       .def("set_rows",
@@ -601,6 +641,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
+  m.def("get_variable_tensor", framework::GetVariableTensor);
 
   m.def("_is_program_version_supported", IsProgramVersionSupported);
 
@@ -886,6 +927,18 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self, int num_trainers) {
             self.num_trainers_ = num_trainers;
           })
+      .def_property(
+          "trainers_endpoints",
+          [](const BuildStrategy &self) { return self.trainers_endpoints_; },
+          [](BuildStrategy &self,
+             const std::vector<std::string> &trainers_endpoints) {
+            self.trainers_endpoints_ = trainers_endpoints;
+          })
+      .def_property("trainer_id",
+                    [](const BuildStrategy &self) { return self.trainer_id_; },
+                    [](BuildStrategy &self, int trainer_id) {
+                      self.trainer_id_ = trainer_id;
+                    })
       .def_property(
           "fuse_elewise_add_act_ops",
           [](const BuildStrategy &self) {
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 02a75236f6c7c7a64f2aa110ca7a7e3d92832fe9..f67f40f19f676054e0ab140f12770f01624fac02 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray(
     paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (int i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index 19fc229e6fa84792f58aeeb00be09eb2401b19c7..57547f1867a937d16fb2dfc9b84e1a30759a527e 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -32,11 +32,28 @@ the image layout as follows.
 
 from __future__ import print_function
 
+import six
 import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
+# FIXME(minqiyang): this is an ugly fix for the numpy bug reported here
+# https://github.com/numpy/numpy/issues/12497
+if six.PY3:
+    import subprocess
+    import sys
+    import_cv2_proc = subprocess.Popen(
+        [sys.executable, "-c", "import cv2"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE)
+    out, err = import_cv2_proc.communicate()
+    retcode = import_cv2_proc.poll()
+    if retcode != 0:
+        cv2 = None
+    else:
+        import cv2
+else:
+    try:
+        import cv2
+    except ImportError:
+        cv2 = None
 import os
 import tarfile
 import six.moves.cPickle as pickle
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 2a53519188e7454b54424cfdd4a713ae37a2326b..52417a1eaf74658b4d87394404eafd5343a3c5fe 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -34,6 +34,7 @@ from . import io
 from . import evaluator
 from . import initializer
 from . import layers
+from . import imperative
 from . import contrib
 from . import nets
 from . import optimizer
@@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \
         'initializer',
         'layers',
         'contrib',
+        'imperative',
         'transpiler',
         'nets',
         'optimizer',
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 5102a558fd3fdfd89ad769cd3a10f5dc3ea78716..13d2893fd146b5a3d9100ee1ba6c2243cb9c411b 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -258,10 +258,13 @@ class DataFeeder(object):
         multiple mini-batches. Each mini-batch will be feed on each device.
 
         Args:
-            reader(fun): the input data.
-            multi_devices(bool): the number of places. Default None.
-            num_places(int): the number of places. Default None.
-            drop_last(bool): the number of places. Default None.
+            reader(function): the reader is the function which can generate data.
+            multi_devices(bool): whether to use multiple devices or not.
+            num_places(int): if the multi_devices is True, you can specify the number
+                of GPU to use, if 'num_places' is None, the function will use all the
+                GPU of the current machine. Default None.
+            drop_last(bool): whether to drop the last batch if the
+                size of the last batch is less than batch_size. Default True.
 
         Returns:
             dict: the result of conversion.
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index b156db53d2928daefed0959fc3e0731709855343..089792059465c60da43d02e8389f4e36900c2292 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -18,6 +18,7 @@ import collections
 import contextlib
 import re
 import six
+import sys
 
 import numpy as np
 
@@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
 ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
 
+_imperative_tracer_ = None
+
+
+def _in_imperative_mode():
+    return _imperative_tracer_ is not None
+
+
+def _imperative_tracer():
+    return _imperative_tracer_
+
 
 class NameScope(object):
     def __init__(self, name="", parent=None):
@@ -345,6 +356,21 @@ class Variable(object):
         self.op = None
         self.stop_gradient = stop_gradient
         self.is_data = is_data
+        if _in_imperative_mode():
+            self._ivar = core.VarBase()
+            self._ivar.desc = self.desc
+
+    def _numpy(self):
+        scope = _imperative_tracer().get_scope(self.block.desc)
+        tensor = core.get_variable_tensor(scope, self.desc.name())
+        return np.array(tensor)
+
+    def _backward(self):
+        scope = _imperative_tracer().get_scope(self.block.desc)
+        self._ivar._run_backward(scope)
+
+    def _gradient(self):
+        return np.array(self._ivar._grad())
 
     def __str__(self):
         return self.to_string(True)
@@ -655,6 +681,23 @@ class Operator(object):
         if self._has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
+        if _in_imperative_mode():
+            self.iop = core.OpBase()
+            self.iop.desc = self.desc
+            self.inputs = []
+            if inputs is not None:
+                for inp in inputs.values():
+                    if isinstance(inp, Variable):
+                        self.inputs.append(inp)
+                    elif isinstance(inp, list) or isinstance(inp, tuple):
+                        self.inputs.extend(inp[:])
+            self.outputs = []
+            if outputs is not None:
+                for out in outputs.values():
+                    if isinstance(out, Variable):
+                        self.outputs.append(out)
+                    elif isinstance(out, list) or isinstance(out, tuple):
+                        self.outputs.extend(out[:])
 
     def _has_kernel(self, op_type):
         return op_type not in self.OP_WITHOUT_KERNEL_SET
@@ -1041,19 +1084,15 @@ class Block(object):
             raise ValueError("var %s not in this block" % name)
         return v
 
-    def _var_recursive(self, name):
+    def _find_var_recursive(self, name):
         """
         Get a Variable by name from this block recursively.
 
         Args:
             name(str): the Variable's name.
 
-        Raises:
-            ValueError: this block and this parent block doesn't
-                have a Variable with the giving name.
-
         Returns:
-            Variable: the Variable with the giving name.
+            Variable: the Variable with the giving name. Or None if not found.
         """
         frontier = list()
         visited = set()
@@ -1079,8 +1118,27 @@ class Block(object):
                 frontier.append(prog.block(cur.forward_block_idx))
 
             visited.add(id(cur))
+        return None
 
-        raise ValueError("Var {0} is not found recursively".format(name))
+    def _var_recursive(self, name):
+        """
+        Get a Variable by name from this block recursively.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: this block and this parent block doesn't
+                have a Variable with the giving name.
+
+        Returns:
+            Variable: the Variable with the giving name.
+        """
+        var = self._find_var_recursive(name)
+        if var:
+            return var
+        else:
+            raise ValueError("Var {0} is not found recursively".format(name))
 
     def all_parameters(self):
         return list(self.iter_parameters())
@@ -1206,6 +1264,9 @@ class Block(object):
         """
         op_desc = self.desc.append_op()
         op = Operator(block=self, desc=op_desc, *args, **kwargs)
+        if _in_imperative_mode():
+            _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
+                                       [v._ivar for v in op.outputs], self.desc)
         self.ops.append(op)
         return op
 
@@ -1442,6 +1503,7 @@ class Program(object):
         self._is_chief = False
         self._slice_vars_and_attrs = []
         self._endpoints = []
+        self._trainers_endpoints = []
         self._distributed_lookup_table = None
 
     @property
@@ -2209,3 +2271,12 @@ def _get_var(name, program=None):
     assert isinstance(program, Program)
 
     return program.global_block().var(name)
+
+
+@contextlib.contextmanager
+def _imperative_guard(tracer):
+    global _imperative_tracer_
+    tmp_trace = _imperative_tracer_
+    _imperative_tracer_ = tracer
+    yield
+    _imperative_tracer_ = tmp_trace
diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..922308b6b18b335535d41f24d544cde04991b794
--- /dev/null
+++ b/python/paddle/fluid/imperative/__init__.py
@@ -0,0 +1,25 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import base
+from .base import *
+
+from . import layers
+from .layers import *
+
+__all__ = []
+__all__ += layers.__all__
+__all__ += base.__all__
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..15d38ddb56c71ef7de67f79cf52cd26070f470cb
--- /dev/null
+++ b/python/paddle/fluid/imperative/base.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import numpy as np
+
+from paddle.fluid import core
+from paddle.fluid import framework
+
+__all__ = ['enabled', 'guard', 'to_variable']
+
+
+def enabled():
+    return framework._in_imperative_mode()
+
+
+@contextlib.contextmanager
+def guard():
+    train = framework.Program()
+    startup = framework.Program()
+    tracer = core.Tracer(train.current_block().desc)
+    with framework.program_guard(train, startup):
+        with framework.unique_name.guard():
+            with framework._imperative_guard(tracer):
+                yield
+
+
+def to_variable(value, block=None):
+    if isinstance(value, np.ndarray):
+        if not block:
+            block = framework.default_main_program().current_block()
+        py_var = framework.Variable(
+            block,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            name=None,
+            shape=value.shape,
+            dtype=value.dtype)
+        scope = framework._imperative_tracer().get_scope(block.desc)
+        var = scope.var(py_var.name)
+        tensor = var.get_tensor()
+        tensor.set(value, core.CPUPlace())
+        return py_var
+    elif isinstance(value, framework.Variable):
+        return value
+    else:
+        raise ValueError("Unsupported type %s" % type(value))
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a28f7f4ae35295394b560d79e3dc0cdd5f2beab
--- /dev/null
+++ b/python/paddle/fluid/imperative/layers.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import sys
+import numpy as np
+
+from paddle.fluid import core
+from paddle.fluid import framework
+from paddle.fluid.imperative import base
+
+__all__ = ['PyLayer']
+
+
+class PyLayer(core.Layer):
+    def __init__(self):
+        pass
+
+    def __call__(self, inputs):
+        # TODO(panyx0718): Support declarative mode as well.
+        assert base.enabled()
+        if not isinstance(inputs, list) and not isinstance(inputs, tuple):
+            inputs = [inputs]
+
+        var_inputs = []
+        for x in inputs:
+            py_var = base.to_variable(x)
+            var_inputs.append(py_var)
+        outputs = self.forward(var_inputs)
+        return outputs
+
+    def forward(self, inputs):
+        return []
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 0782933c6c4851b410ee3fdf14d4f9d9e83d49cc..e74a87fc68db0e126098f7188db4a712dff2612d 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -145,7 +145,7 @@ def save_vars(executor,
 
             prog = fluid.default_main_program()
             fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
-                               vars=None)
+                               vars=None, predicate = name_has_fc)
             # All variables in `main_program` whose name includes "fc" will be saved.
             # And variables are going to be saved separately.
 
@@ -369,7 +369,7 @@ def load_vars(executor,
 
             prog = fluid.default_main_program()
             fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
-                               vars=None)
+                               vars=None, predicate=name_has_fc)
             # All variables in `main_program` whose name includes "fc" will be loaded.
             # And all the variables are supposed to have been saved in differnet files.
 
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index dc317de9abbd06f4021e64b87ea88ba6af8809c9..74b4a977db6b69d4d256e1f7b36eb53524269bb1 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -17,10 +17,13 @@ from __future__ import print_function
 import copy
 import itertools
 import six
+import sys
+import numpy as np
 
 from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
 from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
+from paddle.fluid.imperative import base
 from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
 from six.moves import zip
@@ -46,23 +49,21 @@ class LayerHelper(object):
     def startup_program(self):
         return default_startup_program()
 
+    def to_variable(self, x):
+        return base.to_variable(x, self.main_program.current_block())
+
     def append_op(self, *args, **kwargs):
         return self.main_program.current_block().append_op(*args, **kwargs)
 
     def multiple_input(self, input_param_name='input'):
         inputs = self.kwargs.get(input_param_name, [])
-        type_error = TypeError(
-            "Input of {0} layer should be Variable or sequence of Variable".
-            format(self.layer_type))
-        if isinstance(inputs, Variable):
-            inputs = [inputs]
-        elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
-            raise type_error
+        ret = []
+        if isinstance(inputs, list) or isinstance(inputs, tuple):
+            for inp in inputs:
+                ret.append(self.to_variable(inp))
         else:
-            for each in inputs:
-                if not isinstance(each, Variable):
-                    raise type_error
-        return inputs
+            ret.append(self.to_variable(inputs))
+        return ret
 
     def input(self, input_param_name='input'):
         inputs = self.multiple_input(input_param_name)
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 05138bf94598f649ef7fdbaa94896b6ba0884416..b7e39685691809d04ecddc21d2d04a7a85e478d5 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -717,8 +717,9 @@ class While(object):
 
         out_vars = []
         for inner_out_name in inner_outputs:
-            if inner_out_name in parent_block.vars:
-                out_vars.append(parent_block.var(inner_out_name))
+            inner_var = parent_block._find_var_recursive(inner_out_name)
+            if inner_var:
+                out_vars.append(inner_var)
 
         step_scope = parent_block.create_var(
             type=core.VarDesc.VarType.STEP_SCOPES)
@@ -1264,10 +1265,11 @@ class ConditionalBlock(object):
             if each_name not in input_set
         ]
 
-        out_list = [
-            parent_block.var(var_name) for var_name in parent_block.vars
-            if var_name in intermediate
-        ]
+        out_list = []
+        for inner_out_name in intermediate:
+            inner_var = parent_block._find_var_recursive(inner_out_name)
+            if inner_var:
+                out_list.append(inner_var)
 
         step_scope = parent_block.create_var(
             type=core.VarDesc.VarType.STEP_SCOPES)
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 4833212d311e4792dd14709cf3e5843e297e810d..e25eaaa9fda6add9d8e81d9e6bdfb711cee3648e 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -41,6 +41,7 @@ __all__ = [
     'crf_decoding',
     'cos_sim',
     'cross_entropy',
+    'bpr_loss',
     'square_error_cost',
     'chunk_eval',
     'sequence_conv',
@@ -1348,6 +1349,44 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
     return out
 
 
+def bpr_loss(input, label, name=None):
+    """
+    Bayesian Personalized Ranking Loss Operator.
+
+    This operator belongs to pairwise ranking loss. Label is the desired item.
+    The loss at a given point in one session is defined as:
+    $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j<N_{i},~ j\neq Label[i]}\log(\sigma(X[i, Label[i]]-X[i, j]))$
+
+    Learn more details by reading paper <session-based recommendations with recurrent
+    neural networks>(https://arxiv.org/abs/1511.06939)
+
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x D], where N is the
+                                batch size and D is the number of classes.
+                                This input is not probability but logits.
+        label (Variable|list):  the ground truth which is a 2-D tensor.  `label`
+                                is a tensor<int64> with shape [N x 1].
+        name (str|None):        A name for this layer(optional). If set None, the
+                                layer will be named automatically. Default: None.
+    Returns:
+        A 2-D tensor with shape [N x 1], the bpr loss.
+
+    Examples:
+        .. code-block:: python
+
+          cost = fluid.layers.bpr_loss(input=predict, label=label)
+    """
+
+    helper = LayerHelper('bpr_loss', **locals())
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='bpr_loss',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out]})
+    return out
+
+
 def square_error_cost(input, label):
     """
     **Square error cost layer**
@@ -6623,7 +6662,8 @@ def relu(x, name=None):
     helper = LayerHelper('relu', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
+    helper.append_op(
+        type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
     return out
 
 
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index dc27a8eabb5e3671250958b67ea7212ad9bdf08b..c54c3963a152851f5396c2ba71c28cc09c1cd523 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -135,9 +135,17 @@ class ParallelExecutor(object):
             build_strategy = BuildStrategy()
 
         build_strategy.num_trainers = num_trainers
+        build_strategy.trainer_id = trainer_id
 
         main = main_program
         main = main if main else framework.default_main_program()
+
+        trainers_endpoints = main._trainers_endpoints
+        if num_trainers > 1 and trainers_endpoints:
+            assert num_trainers == len(
+                trainers_endpoints), "num_trainers == len(end_points)"
+            build_strategy.trainers_endpoints = trainers_endpoints
+
         if scope == None:
             scope = executor.global_scope()
 
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 61cfdb80af04ede49621bc680fa0fa733a21a2a3..a4089ba3ca08bed5702a66ed370da52ecd9b58c6 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -43,14 +43,13 @@ if(APPLE)
         list(REMOVE_ITEM TEST_OPS test_desc_clone)
         list(REMOVE_ITEM TEST_OPS test_program_code)
     endif(NOT WITH_DISTRIBUTE)
-    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
+    message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
     # this op is not support on mac
     list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
     # TODO: add the unitest back when it fixed
     list(REMOVE_ITEM TEST_OPS test_detection_map_op)
     list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
     list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
-    list(REMOVE_ITEM TEST_OPS test_gradient_clip)
 endif()
 if(NOT WITH_MKLML)
     # this op is not support on openblas
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
index cf62817956c12cd4487eba88bf49ed43331dff03..faec5350424668fca6416e91c3e58174bd4ec877 100644
--- a/python/paddle/fluid/tests/unittests/dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
@@ -102,7 +102,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
 
         if args.mem_opt:
             fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
-        if args.is_dist:
+        if args.update_method == "pserver":
             t = self.get_transpiler(args.trainer_id,
                                     fluid.default_main_program(),
                                     args.endpoints, args.trainers,
@@ -147,7 +147,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
 
         def get_data():
             origin_batch = next(reader_generator)
-            if args.is_dist and args.use_reader_alloc:
+            if args.update_method == "pserver" and args.use_reader_alloc:
                 new_batch = []
                 for offset, item in enumerate(origin_batch):
                     if offset % 2 == args.trainer_id:
diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8dc5fbd237d17f2d4e45b06e5806fff5cbf58fe
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
@@ -0,0 +1,52 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, randomize_probability
+
+
+class TestBprLossOp1(OpTest):
+    """Test BprLoss with discrete one-hot labels.
+    """
+
+    def setUp(self):
+        self.op_type = "bpr_loss"
+        batch_size = 40
+        class_num = 5
+        X = randomize_probability(batch_size, class_num, dtype='float64')
+        label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
+        bpr_loss_result = []
+        for i in range(batch_size):
+            sum = 0.0
+            for j in range(class_num):
+                if j == label[i][0]:
+                    continue
+                sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label[i][0]])))
+            bpr_loss_result.append(-sum / (class_num - 1))
+        bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64")
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": bpr_loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
index 9f3f2f348166864be9583855fcd1949fd4ac818c..6cd71e39e41dae5d07e5761fc9caeca113f3b47e 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp):
         self.activation = 'identity'
 
 
+class TestIdentityActivation(TestConv2dFusionOp):
+    def init_activation(self):
+        self.activation = 'identity'
+        self.add_residual_data = False
+
+
 class TestWithGroup(TestConv2dFusionOp):
     def init_group(self):
         self.groups = 3
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0e1265e142b800587599783367eca2203033bf1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
+
+
+class TestMKLDNN(TestConv3dOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNCase1(TestCase1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNGroup1(TestWithGroup1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNGroup2(TestWithGroup2):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNWith1x1(TestWith1x1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index 69c5ab7a4a4cbd552d27dcb07052d46752eeb54a..c6b749fe09b18b1d704f45a5a5b3adbd5c6a6d0b 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -74,6 +74,8 @@ class TestConv3dOp(OpTest):
     def setUp(self):
         self.op_type = "conv3d"
         self.use_cudnn = False
+        self.use_mkldnn = False
+        self.data_format = "AnyLayout"
         self.dtype = np.float32
         self.init_kernel_type()
         self.init_group()
@@ -83,8 +85,7 @@ class TestConv3dOp(OpTest):
         conv3d_param = {
             'stride': self.stride,
             'pad': self.pad,
-            'dilations': self.dilations,
-            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
+            'dilations': self.dilations
         }
 
         input = np.random.random(self.input_size).astype(self.dtype)
@@ -101,7 +102,9 @@ class TestConv3dOp(OpTest):
             'paddings': self.pad,
             'groups': self.groups,
             'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn,
+            'data_format': self.data_format
         }
         self.outputs = {'Output': output}
 
@@ -109,59 +112,35 @@ class TestConv3dOp(OpTest):
         return core.is_compiled_with_cuda() and self.use_cudnn
 
     def test_check_output(self):
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_output_with_place(place, atol=1e-5)
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['Input', 'Filter']),
-                'Output',
-                max_relative_error=0.03)
-        else:
-            self.check_grad(
-                set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03)
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']))
-        else:
-            self.check_grad(
-                ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']))
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']))
-        else:
-            self.check_grad(
-                ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']))
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Input']))
 
     def init_test_case(self):
         self.pad = [0, 0, 0]
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 160969c63fcbb1b552df195795017d9677c4fe3a..26fa20291b52e469066e23b5c29a8e11b40a1270 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -76,12 +76,24 @@ class TestDistRunnerBase(object):
 
         if args.mem_opt:
             fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
-        if args.is_dist:
+        if args.update_method == "pserver":
             t = self.get_transpiler(args.trainer_id,
                                     fluid.default_main_program(),
                                     args.endpoints, args.trainers,
                                     args.sync_mode, args.dc_asgd)
             trainer_prog = t.get_trainer_program()
+        elif args.update_method == "nccl2":
+            # transpile for nccl2
+            config = fluid.DistributeTranspilerConfig()
+            config.mode = "nccl2"
+            nccl2_t = fluid.DistributeTranspiler(config=config)
+            nccl2_t.transpile(
+                args.trainer_id,
+                program=fluid.default_main_program(),
+                startup_program=fluid.default_startup_program(),
+                trainers=args.endpoints,
+                current_endpoint=args.current_endpoint)
+            trainer_prog = fluid.default_main_program()
         else:
             trainer_prog = fluid.default_main_program()
 
@@ -110,11 +122,20 @@ class TestDistRunnerBase(object):
                 len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
             mypass.set_int("num_repeats", args.batch_merge_repeat)
 
+        if args.update_method == "nccl2":
+            num_trainers = len(args.endpoints.split(","))
+            trainer_id = args.trainer_id
+        else:
+            num_trainers = 1
+            trainer_id = 0
+
         exe = fluid.ParallelExecutor(
             args.use_cuda,
             loss_name=avg_cost.name,
             exec_strategy=strategy,
-            build_strategy=build_stra)
+            build_strategy=build_stra,
+            num_trainers=num_trainers,
+            trainer_id=trainer_id)
 
         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -126,7 +147,7 @@ class TestDistRunnerBase(object):
 
         def get_data():
             origin_batch = next(reader_generator)
-            if args.is_dist and args.use_reader_alloc:
+            if args.update_method != "local" and args.use_reader_alloc:
                 new_batch = []
                 for offset, item in enumerate(origin_batch):
                     if offset % 2 == args.trainer_id:
@@ -151,7 +172,11 @@ def runtime_main(test_class):
     parser.add_argument(
         '--role', type=str, required=True, choices=['pserver', 'trainer'])
     parser.add_argument('--endpoints', type=str, required=False, default="")
-    parser.add_argument('--is_dist', action='store_true')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default="local",
+        choices=["pserver", "nccl2", "local"])
     parser.add_argument('--trainer_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument(
@@ -170,7 +195,7 @@ def runtime_main(test_class):
     args = parser.parse_args()
 
     model = test_class()
-    if args.role == "pserver" and args.is_dist:
+    if args.role == "pserver" and args.update_method == "pserver":
         model.run_pserver(args)
     else:
         model.run_trainer(args)
@@ -208,6 +233,7 @@ class TestDistBase(unittest.TestCase):
         self._use_reduce = False
         self._dc_asgd = False  # must use with async mode
         self._use_reader_alloc = True
+        self._nccl2_mode = False
         self._setup_config()
         self._after_setup_config()
 
@@ -218,7 +244,7 @@ class TestDistBase(unittest.TestCase):
 
     def start_pserver(self, model_file, check_error_log, required_envs):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
+        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
         ps0_cmd = ps_cmd % \
                   (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
                    self._trainers)
@@ -270,7 +296,8 @@ class TestDistBase(unittest.TestCase):
         else:
             env_local = {'CPU_NUM': '1'}
 
-        envs.update(env_local)
+        env_local.update(envs)
+        print("local_cmd: {}, env: {}".format(cmd, env_local))
 
         if check_error_log:
             err_log = open("/tmp/trainer.err.log", "wb")
@@ -278,13 +305,13 @@ class TestDistBase(unittest.TestCase):
                 cmd.split(" "),
                 stdout=subprocess.PIPE,
                 stderr=err_log,
-                env=envs)
+                env=env_local)
         else:
             local_proc = subprocess.Popen(
                 cmd.split(" "),
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                env=envs)
+                env=env_local)
 
         local_out, local_err = local_proc.communicate()
 
@@ -303,7 +330,7 @@ class TestDistBase(unittest.TestCase):
 
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
 
-        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver"
         tr0_cmd = tr_cmd % \
                   (self._python_interp, model, self._ps_endpoints,
                    0, ps0_ep, self._trainers)
@@ -335,8 +362,8 @@ class TestDistBase(unittest.TestCase):
         env0.update(envs)
         env1.update(envs)
 
-        print("tr0_cmd:{}".format(tr0_cmd))
-        print("tr1_cmd:{}".format(tr1_cmd))
+        print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0))
+        print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1))
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
@@ -351,28 +378,111 @@ class TestDistBase(unittest.TestCase):
             stderr=tr1_pipe,
             env=env1)
 
+        # Wait until trainer process terminate
+        while True:
+            stat0 = tr0_proc.poll()
+            time.sleep(0.1)
+            if stat0 is not None:
+                break
+        while True:
+            stat1 = tr1_proc.poll()
+            time.sleep(0.1)
+            if stat1 is not None:
+                break
+
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
 
         # close trainer file
         tr0_pipe.close()
         tr1_pipe.close()
-
         ps0_pipe.close()
         ps1_pipe.close()
-        # FIXME: use terminate() instead of sigkill.
-        os.kill(ps0.pid, signal.SIGKILL)
-        os.kill(ps1.pid, signal.SIGKILL)
+
         ps0.terminate()
         ps1.terminate()
 
+        # print server log
+        with open("/tmp/ps0_err.log", "r") as fn:
+            sys.stderr.write("ps0 stderr: %s\n" % fn.read())
+        with open("/tmp/ps1_err.log", "r") as fn:
+            sys.stderr.write("ps1 stderr: %s\n" % fn.read())
+
+        # print log
+        if stat0 == 0:
+            sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
+        with open("/tmp/tr0_err.log", "r") as fn:
+            sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
+        if stat1 == 0:
+            sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
+        with open("/tmp/tr1_err.log", "r") as fn:
+            sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
+
+        return pickle.loads(tr0_out), pickle.loads(tr1_out)
+
+    def _run_cluster_nccl2(self, model, envs, check_error_log):
+        # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
+        worker_endpoints = self._ps_endpoints.split(",")
+        w0_ep, w1_ep = worker_endpoints
+
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2"
+        tr0_cmd = tr_cmd % \
+                  (self._python_interp, model, self._ps_endpoints,
+                   0, w0_ep)
+        tr1_cmd = tr_cmd % \
+                  (self._python_interp, model, self._ps_endpoints,
+                   1, w1_ep)
+
+        if self._mem_opt:
+            tr0_cmd += " --mem_opt"
+            tr1_cmd += " --mem_opt"
+        if self._use_reduce:
+            tr0_cmd += " --use_reduce"
+            tr1_cmd += " --use_reduce"
+        if self._use_reader_alloc:
+            tr0_cmd += " --use_reader_alloc"
+            tr1_cmd += " --use_reader_alloc"
+        if self.__use_cuda:
+            tr0_cmd += " --use_cuda"
+            tr1_cmd += " --use_cuda"
+            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        else:
+            env0 = {'CPU_NUM': '1'}
+            env1 = {'CPU_NUM': '1'}
+
+        env0.update(envs)
+        env1.update(envs)
+
+        print("tr0_cmd:{}, env: {}".format(tr0_cmd, env0))
+        print("tr1_cmd:{}, env: {}".format(tr1_cmd, env1))
+        tr0_pipe = open("/tmp/tr0_err.log", "wb")
+        tr1_pipe = open("/tmp/tr1_err.log", "wb")
+
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=tr0_pipe,
+            env=env0)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=tr1_pipe,
+            env=env1)
+
+        tr0_out, tr0_err = tr0_proc.communicate()
+        tr1_out, tr1_err = tr1_proc.communicate()
+
+        # close trainer file
+        tr0_pipe.close()
+        tr1_pipe.close()
+
         # print log
-        sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
         sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
         sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out)
+        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out)
 
-        # return tr0_losses, tr1_losses
         return pickle.loads(tr0_out), pickle.loads(tr1_out)
 
     def check_with_place(self,
@@ -386,21 +496,27 @@ class TestDistBase(unittest.TestCase):
             "PYTHONPATH": os.getenv("PYTHONPATH", ""),
             "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
             "FLAGS_cudnn_deterministic": "1",
-            "http_proxy": ""
+            "http_proxy": "",
+            "NCCL_P2P_DISABLE": "1"
         }
 
         required_envs.update(need_envs)
 
         if check_error_log:
-            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_v"] = "3"
             required_envs["GLOG_logtostderr"] = "1"
 
         local_losses\
             = self._run_local(model_file, required_envs,
                                        check_error_log)
-        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs,
-                                                   check_error_log)
+        if self._nccl2_mode:
+            tr0_losses, tr1_losses = self._run_cluster_nccl2(
+                model_file, required_envs, check_error_log)
+        else:
+            tr0_losses, tr1_losses = self._run_cluster(
+                model_file, required_envs, check_error_log)
 
         for step_id in range(RUN_STEP):
             local_loss = local_losses[step_id]
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index 81eb651878209164b3f339cc5030dbac847942d1..630bed198f4fc382d716373ea872e24b1b45bbf3 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -26,6 +26,19 @@ class TestDistMnist2x2(TestDistBase):
         self.check_with_place("dist_mnist.py", delta=1e-5)
 
 
+class TestDistMnistNCCL2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_mnist.py", delta=1)
+
+
 class TestDistMnist2x2Lars(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
index ea2b554dac83988955e3a7e8919e57a4ed7a8215..4588ca7c17ba5db893f080813d299feaa47626a7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
@@ -44,7 +44,7 @@ class TestDistSaveLoadDense2x2(TestDistBase):
         required_envs.update(need_envs)
 
         if check_error_log:
-            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_v"] = "3"
             required_envs["GLOG_logtostderr"] = "1"
 
         model_dir = tempfile.mkdtemp()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 194387bc98752e66acd2c08a4abcaddfc34ad155..d9ad4e2e2c7b8d0a99d917495fbc8efc6cbd188d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -769,6 +769,7 @@ class TestNCCL2Transpile(TranspilerTest):
 
             config = fluid.DistributeTranspilerConfig()
             config.mode = "nccl2"
+            config.wait_port = False
             t = fluid.DistributeTranspiler(config=config)
             t.transpile(
                 0,
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index e4b3168ba6636253055f546fb3eec8a536714209..e49239da6d3918211fbbc302d2c56818460b6d51 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -20,9 +20,6 @@ import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 
-BATCH_SIZE = 128
-CLIP = 1
-
 
 def bow_net(data,
             label,
@@ -64,6 +61,8 @@ class TestGradientClip(unittest.TestCase):
         return places
 
     def check_operators(self, place):
+        CLIP = 1
+
         prog = fluid.framework.Program()
         startup_program = fluid.framework.Program()
         with fluid.program_guard(
@@ -79,13 +78,13 @@ class TestGradientClip(unittest.TestCase):
             avg_cost = fluid.layers.mean(cost)
 
         prog_clip = prog.clone()
-
         avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
 
         p_g = fluid.backward.append_backward(loss=avg_cost)
         p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
 
-        with fluid.program_guard(main_program=prog_clip):
+        with fluid.program_guard(
+                main_program=prog_clip, startup_program=startup_program):
             fluid.clip.set_gradient_clip(
                 fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
             p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
@@ -96,7 +95,7 @@ class TestGradientClip(unittest.TestCase):
         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=BATCH_SIZE)
+            batch_size=128)
 
         exe = fluid.Executor(place)
         feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
@@ -112,12 +111,12 @@ class TestGradientClip(unittest.TestCase):
                                feed=feeder.feed(data),
                                fetch_list=grad_clip_list)
             global_norm = 0
-            for v in out[1:]:
+            for v in out:
                 global_norm += np.sum(np.power(v, 2))
             global_norm = np.sqrt(global_norm)
 
             global_norm_clip = 0
-            for v in out_clip[1:]:
+            for v in out_clip:
                 global_norm_clip += np.sum(np.power(v, 2))
             global_norm_clip = np.sqrt(global_norm_clip)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5b6305155d1ef3dcf6ce590c221664754c5bdc8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import sys
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+
+class MyLayer(fluid.imperative.PyLayer):
+    def __init__(self):
+        super(MyLayer, self).__init__()
+
+    def forward(self, inputs):
+        x = fluid.layers.relu(inputs[0])
+        self._x_for_debug = x
+        return [fluid.layers.elementwise_mul(x, x)]
+
+
+class TestImperative(unittest.TestCase):
+    def test_layer(self):
+        with fluid.imperative.guard():
+            cl = core.Layer()
+            cl.forward([])
+            l = fluid.imperative.PyLayer()
+            l.forward([])
+
+    def test_layer_in_out(self):
+        with fluid.imperative.guard():
+            l = MyLayer()
+            x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0]
+            self.assertIsNotNone(x)
+            sys.stderr.write("%s output: %s\n" % (x, x._numpy()))
+            x._backward()
+            sys.stderr.write("grad %s\n" % l._x_for_debug._gradient())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index be51fb06a37a376f6f410336184c95981ded35dc..10e8bb5a86691d8654c5ae48794e49f30f47500d 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -846,6 +846,15 @@ class TestBook(unittest.TestCase):
             out = layers.cross_entropy(x, label, False, 4)
             self.assertIsNotNone(out)
 
+    def test_bpr_loss(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[30, 10], dtype="float32")
+            label = layers.data(name="label", shape=[30, 1], dtype="int32")
+            out = layers.bpr_loss(x, label)
+            self.assertIsNotNone(out)
+        print(str(program))
+
     def test_expand(self):
         program = Program()
         with program_guard(program):
diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
index 275e5c49d5c298a95b012582a74f8073b800991e..fa16f082880eb97f54abe8bf75e26321f72b3bd3 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
@@ -22,6 +22,15 @@ from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.transpiler import memory_optimize
 
 
+def _get_vars(prog):
+    assert (isinstance(prog, Program))
+    all_vars = set()
+    for op in prog.global_block().ops:
+        all_vars.update(op.input_arg_names)
+        all_vars.update(op.output_arg_names)
+    return all_vars
+
+
 class TestControlFlowGraph(unittest.TestCase):
     def setUp(self):
         program = Program()
@@ -37,11 +46,11 @@ class TestControlFlowGraph(unittest.TestCase):
         self.program = program
 
     def test_control_flow_graph(self):
-        print("before optimization")
-        print(str(self.program))
-        result_program = memory_optimize(self.program)
-        print("after optimization")
-        print(str(result_program))
+        result_program = self.program.clone()
+        memory_optimize(self.program)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
 
 
 class TestMemoryTranspiler2(unittest.TestCase):
@@ -58,14 +67,22 @@ class TestMemoryTranspiler2(unittest.TestCase):
             avg_cost = layers.mean(cost)
             opt = optimizer.SGD(learning_rate=0.001)
             opt.minimize(avg_cost)
+        self.skip_set = set([cost.name, fc.name])
         self.program = program
 
     def test_inplace_ops(self):
-        print("before optimization")
-        print(str(self.program))
-        result_program = memory_optimize(self.program)
-        print("after optimization")
-        print(str(result_program))
+        result_program = self.program.clone()
+        memory_optimize(self.program)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
+
+    def test_skip_opt(self):
+        result_program = self.program.clone()
+        memory_optimize(self.program, skip_opt_set=self.skip_set)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
 
 
 class TestMemoryTranspiler3(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
index 50204b8a77c187aa695da83860960566448d290f..f8847e1570dc47d432777faa15f4004f1a7111a6 100644
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
@@ -63,6 +63,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
         # expected output selected rows
         expected_out0_rows = [0, 4]
         expected_out1_rows = [0, 2]
+        expected_out2_rows = []
         expected_out4_rows = [0]
 
         op = Operator(
@@ -75,6 +76,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
 
         self.assertEqual(outs[0].rows(), expected_out0_rows)
         self.assertEqual(outs[1].rows(), expected_out1_rows)
+        self.assertEqual(outs[2].rows(), expected_out2_rows)
         self.assertEqual(outs[4].rows(), expected_out4_rows)
 
         self.assertEqual(outs[0].height(), height_sections[0])
@@ -84,6 +86,9 @@ class TestSpliteSelectedRows(unittest.TestCase):
         self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1])
         self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1])
 
+        self.assertEqual(outs[2].numel(), 0)
+        self.assertEqual(outs[3].numel(), 0)
+
     def check_grad_with_place(self, place):
         scope = core.Scope()
         height = 10
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 1d867d9194347cf55fd1bd8b1962856d599be7ec..d21ec42dccde80fd354a730274edb04f654113c3 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -142,6 +142,7 @@ class DistributeTranspilerConfig(object):
     # supported modes: pserver, nccl2
     mode = "pserver"
     print_log = False
+    wait_port = True
 
 
 class DistributeTranspiler(object):
@@ -171,7 +172,6 @@ class DistributeTranspiler(object):
             trainer_id = 0
             trainers = 4
             role = os.getenv("PADDLE_TRAINING_ROLE")
-	     
             t = fluid.DistributeTranspiler()
             t.transpile(
                  trainer_id, pservers=pserver_endpoints, trainers=trainers)
@@ -214,13 +214,16 @@ class DistributeTranspiler(object):
                          trainer_id,
                          trainers,
                          current_endpoint,
-                         startup_program=None):
+                         startup_program=None,
+                         wait_port=True):
         if not startup_program:
             startup_program = default_startup_program()
         if trainer_id >= 0:
             worker_endpoints = trainers.split(",")
             # send NCCL_ID to others or recv from trainer 0
             worker_endpoints.remove(current_endpoint)
+            if trainer_id == 0 and wait_port:
+                wait_server_ready(worker_endpoints)
 
             nccl_id_var = startup_program.global_block().create_var(
                 name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
@@ -302,11 +305,13 @@ class DistributeTranspiler(object):
 
         if self.config.mode == "nccl2":
             assert (isinstance(trainers, str))
+            self.origin_program._trainers_endpoints = trainers.split(",")
             self._transpile_nccl2(
                 trainer_id,
                 trainers,
                 current_endpoint,
-                startup_program=startup_program)
+                startup_program=startup_program,
+                wait_port=self.config.wait_port)
             return
 
         self.trainer_num = trainers
@@ -652,9 +657,6 @@ class DistributeTranspiler(object):
         # NOTE: assume blocks of the same variable is not distributed
         # on the same pserver, only change param/grad varnames for
         # trainers to fetch.
-        sys.stderr.write("get_pserver_program() is deprecated, call \
-get_pserver_programs() to get pserver main and startup \
-in a single call.")
         # step1
         pserver_program = Program()
         pserver_program.random_seed = self.origin_program.random_seed
@@ -922,18 +924,6 @@ in a single call.")
         Returns:
             Program: parameter server side startup program.
         """
-        sys.stderr.write("get_startup_program() is deprecated, call \
-get_pserver_programs() to get pserver main and startup \
-in a single call.")
-        if pserver_program != None:
-            sys.stderr.write("passing pserver_program to get_startup_program() \
-is deprecated, you can use new API get_pserver_programs() to \
-get both pserver main program and startup program.")
-        if startup_program != None:
-            sys.stderr.write("passing startup_program to get_startup_program() \
-is deprecated, use fluid.program_guard() or pass this argument \
-to transpile() call.")
-
         s_prog = Program()
         orig_s_prog = self.startup_program
         s_prog.random_seed = orig_s_prog.random_seed
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index c9f1be934773cc28f026f2b867b9e3a4f7aa8472..95aafec05361a8b66b849268c7a738bb2ee5da86 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import six
 from collections import defaultdict, MutableSet
 from .. import core
 from ... import compat as cpt
@@ -470,8 +471,21 @@ def memory_optimize(input_program,
     Returns:
         None
     """
+
+    def to_name_str(var):
+        if isinstance(var, Variable):
+            return var.desc.name()
+        elif isinstance(var, str):
+            return var
+        elif isinstance(var, six.string_types):
+            return str(var)
+        else:
+            raise TypeError(str(var) + " should be Variable or str")
+
     if level != 0 and level != 1:
         raise ValueError("only support opt_level 0 or 1.")
+    if skip_opt_set is not None and not isinstance(skip_opt_set, set):
+        raise ValueError("only support skip_opt_set as set.")
     global PRINT_LOG
     PRINT_LOG = print_log
     if skip_grads:
@@ -486,6 +500,8 @@ def memory_optimize(input_program,
             skip_opt_set = grad_set
         else:
             skip_opt_set.update(grad_set)
+    if skip_opt_set is not None:
+        skip_opt_set = set(map(to_name_str, skip_opt_set))
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
         cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
diff --git a/python/setup.py.in b/python/setup.py.in
index 5aee26b63832889272cde09c553b4615efb8872a..0eb69cdb5c7d140527dba7a648728750bfb404f7 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -101,6 +101,7 @@ packages=['paddle',
           'paddle.dataset',
           'paddle.reader',
           'paddle.fluid',
+          'paddle.fluid.imperative',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 5c5266f904f5dcf74dd1d4ee7e98081f74a79907..7e61dde0a446cf5bfe656105ffd2472f03576f05 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -27,6 +27,8 @@ import pydoc
 
 member_dict = collections.OrderedDict()
 
+experimental_namespace = {"paddle.fluid.imperative"}
+
 
 def visit_member(parent_name, member):
     cur_name = ".".join([parent_name, member.__name__])
@@ -51,6 +53,8 @@ def visit_member(parent_name, member):
 
 
 def visit_all_module(mod):
+    if (mod.__name__ in experimental_namespace):
+        return
     for member_name in (
             name
             for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))