diff --git a/AUTHORS.md b/AUTHORS.md index deafa641203ed9d9bd794fe92e4a91e3aaa03f63..da91933f4697f3b7c08feab20d703e22397b3757 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -44,6 +44,7 @@ | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | | Sand3r- | Michal Gallus | +| sfraczek | Sylwester Fraczek | | Superjom | Chun-Wei Yan | | tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | @@ -54,6 +55,7 @@ | wangyang59 | Yang Wang | | wangzhen-nlp | Zhen Wang | | wen-bo-yang | Wen-Bo Yang | +| wojtuss | Wojciech Uss | | wwhu | Wei-Wei Hu | | xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | diff --git a/README.md b/README.md index 68421cf177f4cd15f8f44e8d00a27cafb5a13b91..5c428e9900762a208eebbfd053ce98663f803345 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ English | [简体中文](./README_cn.md) [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -18,7 +18,7 @@ learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3) ### Install Latest Stable Release: ``` # Linux CPU @@ -26,9 +26,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.2.0.post87 +pip install paddlepaddle-gpu==1.3.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.2.0.post85 +pip install paddlepaddle-gpu==1.3.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` @@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation. 
- [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) +- [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html) We appreciate your contributions! diff --git a/README_cn.md b/README_cn.md index dfb55b17ca4fd05ce5b7b85b2e26e4f7f7229763..b7b0e75e5524cc483a8c203a382e7f339f91694f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -3,8 +3,8 @@ [English](./README.md) | 简体中文 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) -### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3) ### 安装最新稳定版本: ``` # Linux CPU @@ -24,9 +24,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.2.0.post87 +pip install paddlepaddle-gpu==1.3.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.2.0.post85 +pip install paddlepaddle-gpu==1.3.0.post85 # 其他平台上的安装指引请参考 http://paddlepaddle.org/ ``` @@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85 ## 安装 -推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html) ## 文档 -我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 -[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)和 
+[中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档 - [深度学习101](https://github.com/PaddlePaddle/book) 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 -- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html) 可以在MPI集群上运行分布式训练任务 -- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html) 新的API支持代码更少更简洁的程序 -- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html) 欢迎您的贡献! diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index f24cf96cce30bf4b21e954ab022b2b99aeff37e6..1c3192b0193b8ba2452e959c983b23df458a652a 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -43,7 +43,7 @@ paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'start paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None) paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None @@ -71,7 +71,7 @@ paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'v paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) -paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None)) +paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 
'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)) paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)) paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)) paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)) @@ -121,6 +121,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs= paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) +paddle.fluid.layers.sampled_softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) @@ -303,7 +304,7 @@ paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keyword paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)) paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) 
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index f537e4b9e569dd4c513ac0efde7240833bcf04b6..f4bb2f3e2fc2c8cf0376631d1996b395a8bc581a 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -163,6 +163,20 @@ std::vector BlockDesc::AllOps() const { return res; } +void BlockDesc::Clear() { + // clear all ops + ops_.clear(); + + // clear all vars which are not persistable + for (auto it = vars_.begin(); it != vars_.end();) { + if (it->second->Persistable()) { + ++it; + } else { + vars_.erase(it++); + } + } +} + void BlockDesc::Flush() { for (auto &op_desc : ops_) { op_desc->Flush(); diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 960ca39e1eadd3c064beb0e2c1342a406c4f0b6a..e192624a261e1291f1610e8e7e700d99a9d814d2 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -97,6 +97,8 @@ class BlockDesc { std::vector AllOps() const; + void Clear(); + size_t OpSize() const { return ops_.size(); } OpDesc *Op(int idx) const { return ops_.at(idx).get(); } diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 72c50518af08b9c1b2f97e6864e5836e806c77fc..10aa7a59422f4508dda8d0bcd960583056e25938 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, out_layout = out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; - auto& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(expected_kernel_type.place_)); - auto& cpu_engine = dev_ctx->GetEngine(); - std::vector in_tz = paddle::framework::vectorize2int(in.dims()); std::vector out_tz = in_tz; @@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; - auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); - auto out_format = - platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); - // output tensor has the same dims as input. 
Reorder don't change dims out->Resize(in.dims()); - if (in_format != out_format) { + // tempory mem pd fr out , to make reorder + auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(out->dims()), + mkldnn::memory::format::blocked, out_type); + if (in.get_mkldnn_prim_desc() != out_mem_pd) { void* in_data = GetDataFromTensor(in, in_type); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - auto in_memory = - memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); - auto out_memory = - memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data); + auto out_memory = memory(out_mem_pd, out_data); platform::Reorder(in_memory, out_memory); } else { out->ShareDataWith(in); } out->set_layout(out_layout); - // reset format since the out tensor will be feed to non-MKLDNN OPkernel - out->set_format(memory::format::format_undef); #endif } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 82872224501709080ff02a13464d58543a0abda8..f0203edf05635452bf347335066dadc24ecc3138 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type, #ifdef PADDLE_WITH_MKLDNN // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur - - auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), - ToMKLDNNFormat(lin)); - out.ShareDataWith(input_tensor); - out.set_layout(DataLayout::kMKLDNN); - out.set_format(out_format); + // TODO(jczaja): Remove that once all mkldnn ops + // are modified to work with mkldnn_blocked + auto mkldnn_fmt = [&](int rank) { + switch (rank) { + case 5: + return mkldnn::memory::format::ncdhw; + case 4: + return mkldnn::memory::format::nchw; + case 3: + return mkldnn::memory::format::ncw; + case 2: + return mkldnn::memory::format::nc; + case 1: + return mkldnn::memory::format::x; + default: + return mkldnn::memory::format::blocked; + } + }; + + auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(out.dims()), + mkldnn_fmt(out.dims().size())); + + out.set_mkldnn_prim_desc(out_mem_pd); #endif } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index 2e20c436dfdb61fcda78cd044b86848c750cf22c..ff223e616f7ef0c794e72a0028c7e5bb3f234ec0 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -50,7 +50,7 @@ std::unique_ptr AllReduceDepsPass::ApplyImpl( std::unordered_map vars; // TODO(gongwb): use graph topology sort to find the order of operators. 
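The TransformData hunk above replaces the old set_layout/set_format calls with an MKLDNN memory primitive descriptor built from the tensor dims, where the new mkldnn_fmt lambda picks a default format from the tensor rank. Below is a minimal, self-contained sketch of that rank-to-format mapping; Fmt and DefaultFormatForRank are invented stand-ins, not the real mkldnn::memory::format type used in the patch.

```cpp
// Illustrative stand-in for the rank -> default-format choice made by the new
// mkldnn_fmt lambda in TransformData (plain enum, not MKLDNN's format type).
#include <iostream>

enum class Fmt { x, nc, ncw, nchw, ncdhw, blocked };

Fmt DefaultFormatForRank(int rank) {
  switch (rank) {
    case 5: return Fmt::ncdhw;     // e.g. 3D convolution input
    case 4: return Fmt::nchw;      // typical 2D convolution input
    case 3: return Fmt::ncw;
    case 2: return Fmt::nc;
    case 1: return Fmt::x;
    default: return Fmt::blocked;  // fall back to an opaque blocked layout
  }
}

int main() {
  // A 4-D tensor entering an MKLDNN kernel gets the nchw default.
  std::cout << (DefaultFormatForRank(4) == Fmt::nchw) << "\n";
}
```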
// Note that must assert topology sort is stable - auto& ops = Get>(kAllOpDescs); + auto& ops = graph->Get>(kStaleProgramOpDescs); for (auto* op_desc : ops) { auto outputs = op_desc->Outputs(); for (auto& o_it : outputs) { @@ -120,4 +120,4 @@ std::unique_ptr AllReduceDepsPass::ApplyImpl( REGISTER_PASS(all_reduce_deps_pass, paddle::framework::details::AllReduceDepsPass) - .RequirePassAttr(paddle::framework::details::kAllOpDescs); + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index e5c108f89044253eb54413742679e20cfd0d9f5a..4c5384af613752251fd8336874d86474ffb9515b 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -136,14 +136,17 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ir::Pass *multi_devices_pass; if (strategy_.is_distribution_) { + VLOG(3) << "multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else if (strategy_.async_mode_) { multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + VLOG(3) << "multi devices collective mode with allreduce"; multi_devices_pass = AppendPass("allreduce_mode_multi_devices_pass").get(); } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + VLOG(3) << "multi deivces collective mode with reduce"; multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); } else { PADDLE_THROW("Unknown reduce strategy."); @@ -174,7 +177,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { } std::unique_ptr BuildStrategy::Apply( - const ProgramDesc &main_program, const std::vector &places, + std::unique_ptr graph, + const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -185,7 +189,6 @@ std::unique_ptr BuildStrategy::Apply( // Create a default one if not finalized by user. 
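all_reduce_deps_pass above (and sequential_execution_pass and memory_optimize_pass later in this patch) now read the program's op order from a graph attribute, kStaleProgramOpDescs, which the graph registers for itself when built from the ProgramDesc, rather than requiring every caller to inject a kAllOpDescs pass attribute. A simplified, self-contained sketch of that pattern follows; the Graph, OpDesc, and pass function here are mocks, not Paddle's real ir::Graph or Pass classes, and only the attribute name mirrors the diff.

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct OpDesc {
  std::string type;
};

constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs";

// Mock graph: owns a snapshot of the program's op order as a named attribute.
class Graph {
 public:
  explicit Graph(const std::vector<OpDesc*>& program_ops) {
    // Registered once at construction; later passes may mutate the graph,
    // which is why the attribute is only "stale" program information.
    Set(kStaleProgramOpDescs, new std::vector<OpDesc*>(program_ops));
  }
  void Set(const std::string& name, std::vector<OpDesc*>* attr) {
    attrs_[name].reset(attr);  // the graph takes ownership
  }
  bool Has(const std::string& name) const { return attrs_.count(name) > 0; }
  const std::vector<OpDesc*>& Get(const std::string& name) const {
    return *attrs_.at(name);
  }

 private:
  std::map<std::string, std::unique_ptr<std::vector<OpDesc*>>> attrs_;
};

// A pass only needs the graph; no per-pass attribute has to be wired in by callers.
void OrderDependentPass(const Graph& graph) {
  assert(graph.Has(kStaleProgramOpDescs));
  for (const OpDesc* op : graph.Get(kStaleProgramOpDescs)) {
    (void)op;  // visit ops in original program order
  }
}

int main() {
  OpDesc mul{"mul"}, add{"elementwise_add"};
  std::vector<OpDesc*> program_ops{&mul, &add};
  Graph graph(program_ops);
  OrderDependentPass(graph);
}
```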
CreatePassesFromStrategy(false); - std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); @@ -203,41 +206,12 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif - } else if (pass->Type() == "memory_optimize_pass") { - if (graph->Has(kAllOpDescs)) { - graph->Erase(kAllOpDescs); - } - const std::vector *all_op_descs = - new std::vector(main_program.Block(0).AllOps()); - graph->Set>(kAllOpDescs, - all_op_descs); // take ownership - - pass->Erase(kAllOpDescs); - pass->SetNotOwned>(kAllOpDescs, all_op_descs); - } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; - - pass->Erase(kAllOpDescs); - pass->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "all_reduce_deps_pass") { LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) << ", num_trainers:" << num_trainers_; - - pass->Erase(kAllOpDescs); - pass->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); - } else if (pass->Type() == "inplace_pass") { - if (graph->Has(kAllOpDescs)) { - graph->Erase(kAllOpDescs); - } - graph->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (!use_cuda) { LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 5abadc095caeeb3173ff5cf6ec7bbd58ba29caba..8cb57ad67490c666424e8e6b07094051300431e4 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -115,7 +115,7 @@ struct BuildStrategy { // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. - std::unique_ptr Apply(const ProgramDesc &main_program, + std::unique_ptr Apply(std::unique_ptr graph, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 872bc5d654cd66db821e56031d878815b653645c..f03646705817b49d6d59e8beb3d91f625dc44bef 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -24,12 +24,11 @@ namespace details { FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph) + const std::vector &places, ir::Graph *graph) : strategy_(strategy), local_scopes_(local_scopes), places_(places), - graph_(std::move(graph)), + graph_(graph), pool_(strategy.num_threads_), prepare_pool_(1), // add one more thread for generate op_deps fetch_ctxs_(places) { @@ -110,14 +109,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( } } if (exception_.IsCaught()) { - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); exception_.ReThrow(); } } num_complete += num_comp; } // Wait FetchOps. 
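BuildStrategy::Apply now receives an already-constructed graph instead of building one from the ProgramDesc, and the SSA graph executors below switch from std::unique_ptr<ir::Graph> to a plain ir::Graph*: the executor only observes the graph while whoever built it keeps ownership. A stripped-down sketch of that ownership split, with mock types rather than the real executors:

```cpp
// Mock of the ownership split: the caller owns the graph via unique_ptr and the
// executor keeps only a non-owning raw pointer, mirroring the change from
// std::unique_ptr<ir::Graph>&& to ir::Graph* in the executor constructors.
#include <memory>
#include <vector>

struct Graph {
  std::vector<int> nodes;
};

class Executor {
 public:
  explicit Executor(Graph* graph) : graph_(graph) {}  // non-owning
  std::size_t Run() const { return graph_->nodes.size(); }

 private:
  Graph* graph_;  // lifetime is managed by the caller
};

// Stand-in for BuildStrategy::Apply: takes ownership, applies "passes", returns it.
std::unique_ptr<Graph> ApplyPasses(std::unique_ptr<Graph> graph) {
  graph->nodes.push_back(42);  // pretend a pass rewrote the graph
  return graph;
}

int main() {
  auto graph = ApplyPasses(std::make_unique<Graph>());
  Executor exec(graph.get());  // executor observes; caller keeps the unique_ptr alive
  return exec.Run() == 1 ? 0 : 1;
}
```

Presumably the point of the raw pointer is that the caller can keep the same graph alive across several executors (as ParallelSSAGraphExecutor does per place) without transferring ownership back and forth.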
- ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); return fetches; } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index c3a8b85423403992e3a12ceb0a1acbae82d25dfa..970298950cc8089bc5861fcbf8dc2544934b181f 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + ir::Graph *graph); FeedFetchList Run(const std::vector &fetch_tensors) override; const ir::Graph &Graph() const override; @@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::vector local_scopes_; std::vector places_; - std::unique_ptr graph_; + ir::Graph *graph_; std::unordered_map op_deps_; std::vector bootstrap_ops_; diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index db4e805bb692ee44ac50337fae54f8dbfe389e6f..0d7cbf298118722b8f32ccc5a8016ae5e168700b 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -33,10 +33,10 @@ namespace details { using paddle::framework::VarDesc; std::vector SortOpLikeDescOrder(const ir::Graph& graph) { - PADDLE_ENFORCE(graph.Has(kAllOpDescs), - "Graph has no attribute of kAllOpDescs."); + PADDLE_ENFORCE(graph.Has(kStaleProgramOpDescs), + "Graph has no attribute of kStaleProgramOpDescs."); // 1. get op desc order - auto& op_descs = graph.Get>(kAllOpDescs); + auto& op_descs = graph.Get>(kStaleProgramOpDescs); // 2. 
topology sort order auto nodes = graph.Nodes(); @@ -461,11 +461,21 @@ void ControlFlowGraph::LiveVariableAnalysis() { } } } + + for (auto* op : ops_) { + unlived_vars_[op] = std::set(); + for (auto& var : this->LiveIn(op)) { + if (!this->LiveOut(op).count(var)) { + unlived_vars_[op].insert(var); + } + } + } } void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, const std::string& new_node, int begin_idx) { + std::vector need_update(ops_.size(), false); // update graph from begin idx to the end for (size_t i = begin_idx; i != ops_.size(); ++i) { auto* op = ops_[i]; @@ -480,15 +490,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, if (live_in_[op].find(old_node) != live_in_[op].end()) { live_in_[op].erase(old_node); live_in_[op].insert(new_node); + need_update[i] = true; } if (live_out_[op].find(old_node) != live_out_[op].end()) { live_out_[op].erase(old_node); live_out_[op].insert(new_node); + need_update[i] = true; + } + } + + for (size_t i = begin_idx; i < ops_.size(); ++i) { + if (!need_update[i]) continue; + auto* op = ops_[i]; + for (auto& var : this->LiveIn(op)) { + if (!this->LiveOut(op).count(var)) { + unlived_vars_[op].insert(var); + } } } } -const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { +const std::set& ControlFlowGraph::LiveIn(ir::Node* op) const { auto it = live_in_.find(op); PADDLE_ENFORCE( it != live_in_.end(), @@ -496,7 +518,7 @@ const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { return it->second; } -const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { +const std::set& ControlFlowGraph::LiveOut(ir::Node* op) const { auto it = live_out_.find(op); PADDLE_ENFORCE( it != live_out_.end(), @@ -504,15 +526,24 @@ const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { return it->second; } -const std::set ControlFlowGraph::Use(ir::Node* op) const { +const std::set& ControlFlowGraph::Use(ir::Node* op) const { auto it = uses_.find(op); PADDLE_ENFORCE( it != uses_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + string::Sprintf("Expect %s in use, but Not Found.", op->Name())); + return it->second; +} + +const std::set& ControlFlowGraph::Unlived(ir::Node* op) const { + auto it = unlived_vars_.find(op); + PADDLE_ENFORCE( + it != unlived_vars_.end(), + string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name())); + return it->second; return it->second; } -const std::vector ControlFlowGraph::Ops() const { return ops_; } +const std::vector& ControlFlowGraph::Ops() const { return ops_; } std::vector& ControlFlowGraph::Ops() { return ops_; } diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index 377367faf3c529496b00004f23159750cc2e4bc4..b5348cc66eaa446719b299b63caa340eab3e2ab9 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -92,10 +92,11 @@ class ControlFlowGraph { void RenameVarInCFGGraph(const std::string& old_node, const std::string& new_node, int begin_idx); - const std::set LiveIn(ir::Node* op) const; - const std::set LiveOut(ir::Node* op) const; - const std::set Use(ir::Node* op) const; - const std::vector Ops() const; + const std::set& LiveIn(ir::Node* op) const; + const std::set& LiveOut(ir::Node* op) const; + const std::set& Use(ir::Node* op) const; + const std::set& Unlived(ir::Node* op) const; + const std::vector& Ops() const; std::vector& Ops(); // for ssa-graph nodes @@ -117,6 +118,7 @@ 
class ControlFlowGraph { VarSetMap live_out_; VarSetMap uses_; // op inputs VarSetMap defs_; // op outputs + std::unordered_map> unlived_vars_; std::vector ops_; // op sequence by topology sort }; diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc index 3cfe297a73cf4128b7191cbd432cdceadc6240ec..5389e76e0c65c7c0ee23004ca1b0a56efb4c54fe 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -228,9 +228,6 @@ TEST(CFGGraph, IRGraph) { // prepare ir graph auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership ControlFlowGraph cfg(graph); cfg.LiveVariableAnalysis(); @@ -256,9 +253,6 @@ TEST(CFGGraph, IRGraph) { TEST(SortOpLikeDescOrder, NormalTest) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto nodes = SortOpLikeDescOrder(graph); auto op_descs = prog.Block(0).AllOps(); @@ -273,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) { TEST(SortOpLikeDescOrder, RemoveOpDesc) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto nodes = graph.Nodes(); auto op_descs = prog.Block(0).AllOps(); ir::Node* found_node = nullptr; @@ -324,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) { // 3. add some op_desc TEST(SortOpLikeDescOrder, AddOpDesc) { auto prog = FillProgramDesc(); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); ir::Graph graph(prog); auto find_node_in_graph = [&](std::string s) { @@ -342,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) { // cached desc different with real one // mimic the intermidiete pass modify the programdesc. 
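The new unlived_vars_ member declared above caches, for each op, the variables that are live on entry but not on exit, so MemoryOptimizePass can ask Unlived(op) instead of re-testing LiveOut membership for every LiveIn variable. A small sketch of that precomputation, using plain std::set and integer op ids as stand-ins for ir::Node*; ComputeUnlived is a hypothetical helper, not a function from the patch:

```cpp
// Sketch of the unlived-variable precomputation: for each op, variables that are
// live-in but not live-out can be released (and their memory reused) after the op.
#include <iostream>
#include <map>
#include <set>
#include <string>

using VarSet = std::set<std::string>;

std::map<int, VarSet> ComputeUnlived(const std::map<int, VarSet>& live_in,
                                     const std::map<int, VarSet>& live_out) {
  std::map<int, VarSet> unlived;
  for (const auto& kv : live_in) {
    int op = kv.first;
    const VarSet& out = live_out.at(op);
    for (const auto& var : kv.second) {
      if (out.count(var) == 0) {
        unlived[op].insert(var);  // last use is at (or before) this op
      }
    }
  }
  return unlived;
}

int main() {
  std::map<int, VarSet> live_in{{0, {"x", "w"}}, {1, {"y"}}};
  std::map<int, VarSet> live_out{{0, {"y"}}, {1, {}}};
  auto unlived = ComputeUnlived(live_in, live_out);
  std::cout << unlived[0].count("x") << unlived[0].count("w") << "\n";  // 11: both reusable
}
```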
- graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto op_descs = prog.Block(0).AllOps(); + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); auto op = prog.MutableBlock(0)->AppendOp(); prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); @@ -376,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) { TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto find_node_in_graph = [&](std::string s) { ir::Node* ret = nullptr; @@ -392,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { return ret; }; + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); + // remove sum node - auto op_descs = prog.Block(0).AllOps(); ir::Node* found_node = nullptr; auto nodes = graph.Nodes(); for (auto node : nodes) { @@ -454,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); auto find_node_in_graph = [&](std::string s) { ir::Node* ret = nullptr; @@ -470,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { return ret; }; - auto op_descs = prog.Block(0).AllOps(); // add node auto op = prog.MutableBlock(0)->AppendOp(); prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index fd02bc4697e72cdd1e5af63d71931b8fe8cc29e3..e7284ea64438557161a0c97a6a7f45fb9bb245ca 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -118,13 +118,11 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( } } // fill the pool - for (auto var : cfg_->LiveIn(op)) { - if (cfg_->LiveOut(op).count(var) == 0) { - ir::Node* var_node = cfg_->GetNodeByName(var, op); - if (var_node == nullptr || var_node->IsCtrlVar()) continue; - if (NodeCanReused(var_node) && !pool_.Has(var_node)) { - pool_.Insert(var_node); - } + for (auto& var : cfg_->Unlived(op)) { + ir::Node* var_node = cfg_->GetNodeByName(var, op); + if (var_node == nullptr || var_node->IsCtrlVar()) continue; + if (NodeCanReused(var_node) && !pool_.Has(var_node)) { + pool_.Insert(var_node); } } } @@ -337,4 +335,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, REGISTER_PASS(memory_optimize_pass, paddle::framework::details::MemoryOptimizePass) - .RequireGraphAttr(paddle::framework::details::kAllOpDescs); + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 6c314c2e89fea841fea1413b3d01ec2dd9f6b3a3..109037c3e6bc4af9d39f31d8f6cca4f185c2435f 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -969,9 +969,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, } void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { - if (need_broadcast_var_ || - (UseGPU() && - strategy_.reduce_ == 
BuildStrategy::ReduceStrategy::kReduce)) { + // broadcast received parameters when training in parameter server mode. + if (need_broadcast_var_) { + // There are 4 conditions: + // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUs. + // Need to broadcast received parameters to other GPUs. + // 2. GPU && AllReduce: AllReduce all gradients to each GPU. Need to + // broadcast received parameters to other GPUs. + // 3. CPU && AllReduce: AllReduce all gradients to each thread. Need to + // broadcast received parameters to other scopes. + // 4. CPU && Reduce: because all parameters share the same memory, do not + // broadcast received parameters. + if (!UseGPU() && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + return; + } if (strategy_.fuse_broadcast_op_) { CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 4c8f69c68ce17d0143c34e8adbab92cdc90058c8..5b8ae8b6770df79df309bb6be16e4f2a24ee0460 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -20,8 +20,7 @@ namespace framework { namespace details { std::vector> -ParallelSSAGraphExecutor::SeparateMultiDevicesGraph( - std::unique_ptr &&graph) { +ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) { std::vector> graphs; graphs.reserve(places_.size()); for (size_t i = 0; i < places_.size(); ++i) { @@ -77,24 +76,18 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph( ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - const framework::ProgramDesc &main_prog, std::unique_ptr &&graph) + const std::vector &places, ir::Graph *graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - main_prog_(main_prog), // TODO(Yancey1989): Copying graphs is not safely since it deleted the // attrs.
- graphs_(SeparateMultiDevicesGraph(std::move(graph))) { + graphs_(SeparateMultiDevicesGraph(graph)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); auto seq_allreduce_pass = ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); - seq_allreduce_pass->Erase(details::kAllOpDescs); - seq_allreduce_pass->Set>( - details::kAllOpDescs, - new std::vector(main_prog_.Block(0).AllOps())); for (size_t i = 0; i < graphs_.size(); ++i) { graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i])); } @@ -107,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i)))); + strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get())); } } diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index 1c35d45fdd356a867d1ad80b345379395e03172e..1e421f2a3a51363fe368859f7a34593c8c894077 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -31,8 +31,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - const framework::ProgramDesc &main_prog, - std::unique_ptr &&graph); + ir::Graph *graph); ~ParallelSSAGraphExecutor() final = default; const ir::Graph &Graph() const override { return *graphs_[0]; } @@ -41,13 +40,12 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { private: std::vector> SeparateMultiDevicesGraph( - std::unique_ptr &&graph); + ir::Graph *graph); ExecutionStrategy strategy_; std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; - framework::ProgramDesc main_prog_; std::vector> graphs_; std::vector> executors_; diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index 879fb29d5926941e574d0080051c195293bc60a9..0b53a76e7877891509ea4d0334673ae2a1fcf949 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -40,7 +40,7 @@ std::unique_ptr SequentialExecutionPass::ApplyImpl( static std::unordered_set skip_dist_ops{ "send", "recv", "send_barrier", "fetch_barrier"}; - auto &ops = Get>(kAllOpDescs); + auto &ops = graph->Get>(kStaleProgramOpDescs); std::vector op_node_list; op_node_list.reserve(ops.size()); @@ -107,4 +107,4 @@ std::unique_ptr SequentialExecutionPass::ApplyImpl( REGISTER_PASS(sequential_execution_pass, paddle::framework::details::SequentialExecutionPass) - .RequirePassAttr(paddle::framework::details::kAllOpDescs); + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c9182afbc0fb5b9b58d3b8f1e4f28c37765589ad..84366263629361ce0f34869201c2df0b8b0d69c0 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -23,9 +23,8 @@ namespace framework { namespace details { ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - 
const std::vector &places, - std::unique_ptr &&graph) - : graph_(std::move(graph)), + const std::vector &places, ir::Graph *graph) + : graph_(graph), pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) : nullptr), local_scopes_(local_scopes), @@ -123,7 +122,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); exception_holder_.ReThrow(); } else { continue; @@ -148,7 +147,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( } PADDLE_ENFORCE(ready_ops.empty()); // Wait FetchOps. - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); return fetch_data; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index ae9cb1ebca4f6df33a364926614acb82e6405438..923e940884555ae71ac8047be1a181531d21f356 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + ir::Graph *graph); const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool @@ -56,7 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: - std::unique_ptr graph_; + ir::Graph *graph_; std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; std::vector places_; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 07c2c970d4de3cecf03e4cf80e60e81e7a9595a8..25d9afbcc8b2bc89ec47654f0dba4cb838be55b0 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -102,6 +102,7 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) + cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) endif () diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h index a756dfc1b98e1de55c809c73e2c4df1e628950ae..39b0585d3a6f9b52c9ec4b0a24f8532a3410851a 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -22,7 +22,8 @@ namespace ir { class AttentionLSTMFusePass : public FusePassBase { protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h index ad966e11e6222a4ed4c730089c454b0d1c7bd0b3..8c3c8b56c08cc09e66b20d17bf730edec0499f35 100644 
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase { virtual ~ConvAffineChannelFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_affine_channel_fuse"}; }; @@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { virtual ~ConvEltwiseAddAffineChannelFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; }; diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h index 2c9eb574fe8e054e0ae221f08f664b91f05d95c9..cf425a2730904d4ab21c33e66b72db0692cb087c 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h @@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase { virtual ~ConvBNFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_bn_fuse"}; }; @@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase { virtual ~ConvEltwiseAddBNFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_eltwiseadd_bn_fuse"}; }; diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h index 3b40a5a92665c07bc2b66e6a96721f573d40393f..9259a4ac5c89b1a7d1413fb2eaaa5fc6a70348f2 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase { virtual ~ConvElementwiseAdd2ActFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h index ac69aa6458fc8c19b670dea2af1251c44dc353a8..9c0b50f155821cf2bd815a6fb087e3f6cc513641 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase { virtual ~ConvElementwiseAddActFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h index f234603f5856a9238164f7fb0e5cc81ea9b7ed60..bf43bd5ce2602a3e240c56f00f66f13b79151002 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase { virtual ~ConvElementwiseAddFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + 
std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h index e5ad3067ec4060e41f1464395f3fc76183de3e66..fde2a0a4eecdec9ad5ac58ad8e63c26cce482682 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase { virtual ~EmbeddingFCLSTMFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"embedding_fc_lstm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index 6c69539d1e48268afc2435f8f73b3818d13107cd..783a052edcf84c8c437a7b2e25f0d67c0366691e 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase { virtual ~FCFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h index 63e1c72bfb2e2641ae5d44858b342d5e427e9045..e359a3289440fffbec622488ecf3a7f49e986574 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase { virtual ~FCGRUFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"fc_gru_fuse"}; }; @@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase { virtual ~MulGRUFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"fc_nobias_gru_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h index 3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9..21482615a6efef930b7328594477a51f4aaf28e7 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase { virtual ~FCLstmFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"fc_lstm_fuse"}; }; @@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase { virtual ~MulLstmFusePass() {} protected: - 
std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"fc_nobias_lstm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h index b2fecc076efca333539fe81e67eee222873aee2a..0fee5274478e8b8db852774077ff5979f0aaba25 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h @@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase { virtual ~FuseElewiseAddActPass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; std::unique_ptr FuseElewiseAddAct( std::unique_ptr graph, diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h index 6bd653775e42c1ee16051e205e9fa9888ea05eaa..efb49b8300e677f17d9e205800d837b88edfd2e9 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h @@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase { virtual ~FuseReluDepthwiseConvPass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; std::unique_ptr FuseReluDepthwiseConv( std::unique_ptr graph, bool only_forward) const; }; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 4b5c846f3271b2dd5e094020571069aff590cd2b..5e954fa9c419b249bb8a4be5a78c01da85b017b2 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -76,6 +76,9 @@ std::map> Graph::InitFromProgram( var->inputs.push_back(node); } } + Set>( + details::kStaleProgramOpDescs, + new std::vector(program.Block(0).AllOps())); return var_nodes; } diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 296f3b83961c1379ee2c1237aa15784791b46878..cfd974e4bd679fdd06739f4c943bb197865020fb 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -31,7 +31,7 @@ namespace details { // This attr is not recommended, because the graph should not dependence // the program once it is built. -constexpr char kAllOpDescs[] = "all_op_descs"; +constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs"; } // namespace details namespace ir { @@ -195,6 +195,12 @@ class Graph { return nullptr; } + // Returns reference to the original program. + // WARN: After a series of passes, the current graph can be quite + // different from OriginProgram. Caller shouldn't assume much from + // the returned OriginProgram. + const ProgramDesc &OriginProgram() const { return program_; } + // This method takes ownership of `node`. 
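A recurring change across the ir/ fuse-pass headers in this patch is adding override to the ApplyImpl declarations. The sketch below illustrates why, with simplified types rather than Paddle's actual Pass interface: with override, any drift between the base-class virtual and the subclass signature becomes a compile error instead of a silently unused overload.

```cpp
// Simplified illustration of why the ApplyImpl declarations gain `override`.
#include <memory>

struct Graph {};

struct Pass {
  virtual ~Pass() = default;
  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
};

struct FcFusePassLike : Pass {
  // `override` makes the compiler check this really overrides Pass::ApplyImpl;
  // change the parameter or const-ness and compilation fails instead of the
  // pass silently never being invoked through the base interface.
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
    return graph;
  }
};

int main() {
  FcFusePassLike pass;
  auto out = pass.ApplyImpl(std::make_unique<Graph>());
  return out != nullptr ? 0 : 1;
}
```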
ir::Node *AddNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h index 50a654d82f0e4fb7e8e91c665397716407e6d2a5..6da592561da1e4046acbfd86c04862f69b7a97a8 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -22,7 +22,8 @@ namespace ir { class IdentityScaleOpCleanPass : public FusePassBase { protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; private: virtual ~IdentityScaleOpCleanPass() = default; diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index 7310f596f8a3170e84840be4bab8390b780b6577..f9157b10d9554092a5da6a6f73ecf7ceac1430dd 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass { virtual ~LockFreeOptimizePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; private: // Create a new sgd node via current optimizer node diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index f3ad9f1c2bf14db418629e0c607e2510f01908b8..0ef5c177bf98b354bb18fc1d2ec8e5bef4b58951 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase { virtual bool is_conv3d() const { return false; } protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; /* diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..38b7fe52037c1a264e4251b7a54ef7569ee6d765 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + } else if (type == "elementwise_add") { + op->SetAttr("use_mkldnn", true); + op->SetInput("X", {inputs[0]}); + op->SetInput("Y", {inputs[1]}); + } + op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +// (c, weights)->conv->f +// (f)->elementwise_add->g +ProgramDesc BuildProgramDesc(bool convWithExistingBias) { + ProgramDesc prog; + std::vector nodes{"c", "weights", "f", "eltwise_bias", "g"}; + if (convWithExistingBias) nodes.push_back("conv_bias"); + for (auto& v : nodes) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") { + var->SetPersistable(true); + } + } + + // conv+bias, both with MKL-DNN + if (convWithExistingBias) { + SetOp(&prog, "conv2d", "conv", + std::vector({"c", "weights", "conv_bias"}), + std::vector({"f"})); + } else { + SetOp(&prog, "conv2d", "conv", std::vector({"c", "weights"}), + std::vector({"f"})); + } + SetOp(&prog, "elementwise_add", "eltwise", + std::vector({"f", "eltwise_bias"}), + std::vector({"g"})); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); +} + +void MainTest(bool convWithExistingBias) { + auto prog = BuildProgramDesc(convWithExistingBias); + std::unique_ptr graph(new ir::Graph(prog)); + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + // Init scope, as it is used in pass + exe.CreateVariables(prog, 0, true, &scope); + if (convWithExistingBias) { + InitTensorHolder(&scope, place, "conv_bias"); + InitTensorHolder(&scope, place, "eltwise_bias"); + } + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); + + auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: Conv, Bias, conv_out + // Add 1 Node: ConvBias + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_bias op in newly generated graph + int conv_bias_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); + // check if "conv" convolution is fused + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "conv") { + auto input_names = op->InputNames(); + ASSERT_TRUE(std::find(input_names.begin(), 
input_names.end(), "Bias") != + input_names.end()); + auto bias = boost::get>(op->Input("Bias")); + if (bias.size()) { + ++conv_bias_count; + } + } + } + } + EXPECT_EQ(conv_bias_count, 1); +} + +TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); } + +TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); } + +TEST(ConvBiasFusePass, conv3d) { + Conv3DBiasFusePass pass; + ASSERT_TRUE(pass.is_conv3d()); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_bias_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h index 3f3f0846eba1201e57a653f8e515c28d2bcdd5e3..ede0bea07ff4130a0f6b3d21d6e34222a5013170 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h @@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase { virtual ~RepeatedFCReluFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"repeated_fc_relu_fuse"}; }; diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h index 9f5fd1a29adf918806d8f30097d8c7f002f48f3e..06e18f9dc327bf2ffaf8d2ab64edcbddea2eb04c 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase { virtual ~SeqConcatFcFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h index dac9de71930c1768bdf416520caae6468449cd3d..c36c6b76a238dd21eb0c9308e780761aa9e4e27a 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h @@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase { virtual ~SeqConvEltAddReluFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"seqconv_eltadd_relu_fuse"}; }; diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h index ba2154045e62c687173565c5ad30ea4d45d3c8f4..a5db3528da36ad08bb7f4d2765ee78222c569a5c 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h @@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase { virtual ~SeqPoolConcatFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"seqpool_concat_fuse"}; }; diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h index 
fb49adc3768ec99cab4321c6b90c93dfed6d32f2..c21ba65c40a8d54c315ab347e5a8a3266a143779 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h @@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase { virtual ~SquaredMatSubFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; const std::string name_scope_{"squared_mat_sub_fuse"}; }; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h index fb0f0ae9efdc5a25a799d6123fa658a99860cd86..a7d18ec86da1c02aef84c25c378691eb8f651015 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h @@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase { virtual ~TransposeFlattenConcatFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 0a0f8f4655bc34cdb25205ff6eaec9f96c801ebd..5f3ce60e1d94ea4078cf0b709df362bad317f621 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -27,7 +27,7 @@ enum class OpRole { kForward = 0x0000, kBackward = 0x0001, kOptimize = 0x0002, - // RPC role is for send/recv releated op + // RPC role is for send/recv related op kRPC = 0x0004, // Dist role is for split_byref/split_selected_rows/concat // used for distributed training. diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9a0348871b050278da2ad07ac6992188a702da42..64592d73e1741c2bc93a2c90b58b1824b2c887f9 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -904,6 +904,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } +std::vector* OperatorWithKernel::GetKernelConfig( + const OpKernelType& key) const { + auto config_iter = kernel_configs_map_.find(key); + std::vector* kernel_configs = nullptr; + if (config_iter != kernel_configs_map_.end()) { + kernel_configs = &(config_iter->second); + } + return kernel_configs; +} + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { RuntimeContext ctx(Inputs(), Outputs(), scope); @@ -921,7 +931,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, scope, *dev_ctx, ctx)); + ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -940,6 +950,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } + std::vector* kernel_configs = + GetKernelConfig(expected_kernel_key); + // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = @@ -957,7 +970,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, this->InferShape(&infer_shape_ctx); // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // not Scope. Imperative mode only pass inputs and get outputs. 
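A note on the kernel-config plumbing being added around here: OperatorWithKernel now keeps a mutable kernel_configs_map_ keyed by OpKernelType, GetKernelConfig(key) returns the config vector for the selected kernel (or nullptr), and RunImpl forwards that vector into the ExecutionContext so the kernel can fetch a typed entry with ctx.GetKernelConfig<T>(idx). Below is a hedged sketch of the consuming side, assuming a CUDA build and assuming the op has filled config slot 0 with an AlgorithmsCache for the forward cuDNN algorithm when its expected kernel type was resolved (compare the conv_cudnn_op.cu.cc hunk near the end of this diff); PickConvAlgo is an illustrative helper, not part of the patch:

```
#include <cudnn.h>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"

namespace paddle {
namespace operators {

cudnnConvolutionFwdAlgo_t PickConvAlgo(
    const framework::ExecutionContext& ctx, const std::vector<int64_t>& x_dims,
    const std::vector<int64_t>& f_dims, const std::vector<int>& strides,
    const std::vector<int>& paddings, const std::vector<int>& dilations) {
  // Slot 0 must have been registered in kernel_configs_map_ beforehand;
  // otherwise GetKernelConfig raises an enforce failure.
  auto& algo_cache =
      ctx.GetKernelConfig<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
  // Return the algorithm cached for these shapes, or run the generator once
  // (normally an exhaustive cuDNN search) and cache its result.
  return algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings, dilations,
                                 /*algorithmFlags=*/0, [&]() {
                                   return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
                                 });
}

}  // namespace operators
}  // namespace paddle
```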
- kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); + kernel_iter->second( + ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e33214b44bb5d8ea5eb32d442d597a369c198bdd..8a86813e9362d7b82c2023428a35a1982adb0508 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" @@ -184,12 +185,30 @@ class OperatorBase { const platform::Place& place) const = 0; }; +#ifdef PADDLE_WITH_CUDA +using KernelConfig = boost::variant< + std::shared_ptr>, + std::shared_ptr>, + std::shared_ptr>>; +#else +using KernelConfig = boost::variant; +#endif + +using OpKernelConfigsMap = + std::unordered_map, + OpKernelType::Hash>; + class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, const platform::DeviceContext& device_context, - const RuntimeContext& ctx) - : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {} + const RuntimeContext& ctx, + std::vector* configs) + : op_(op), + scope_(scope), + device_context_(device_context), + ctx_(ctx), + kernel_configs_(configs) {} const OperatorBase& op() const { return op_; } @@ -398,11 +417,20 @@ class ExecutionContext { return temp_tensor; } + template + T& GetKernelConfig(int idx) const { + PADDLE_ENFORCE(kernel_configs_ && kernel_configs_->size() > idx, + "%s selected kernel doesn't have kernel config %lu <= %d", + op_.Type().c_str(), kernel_configs_->size(), idx); + return *boost::get>(kernel_configs_->at(idx)); + } + private: const OperatorBase& op_; const Scope& scope_; const platform::DeviceContext& device_context_; const RuntimeContext& ctx_; + mutable std::vector* kernel_configs_; }; template <> @@ -483,6 +511,8 @@ class OperatorWithKernel : public OperatorBase { virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + std::vector* GetKernelConfig(const OpKernelType& key) const; + protected: virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, @@ -508,6 +538,9 @@ class OperatorWithKernel : public OperatorBase { void TransferInplaceVarsBack(const Scope& scope, const std::vector& inplace_vars, const Scope& exec_scope) const; + + protected: + mutable OpKernelConfigsMap kernel_configs_map_; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h new file mode 100644 index 0000000000000000000000000000000000000000..c520c222350ceeef246dae756a7157872ae087fa --- /dev/null +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { + +// Not thread-safe. Should be owned per-kernel. +template +class AlgorithmsCache { + public: + AlgorithmsCache() : search_times_(0) { hash_.clear(); } + // Caches the best algorithm for a given + // combination of tensor dimensions & compute data type. + TAlgorithm GetAlgorithm( + const std::vector& dims1, const std::vector& dims2, + const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, + int algorithmFlags, // can set for different data type + std::function gen_func); + + TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, + std::function gen_func); + + private: + std::unordered_map hash_; + int search_times_; +}; + +template +TAlgorithm framework::AlgorithmsCache::GetAlgorithm( + const std::vector& dims1, const std::vector& dims2, + const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, int algorithmFlags, + std::function gen_func) { + int64_t seed = 0; + // Hash all of the inputs, use to try and look up a previously + // discovered algorithm, or fall back to generating a new one. + std::hash hashFn; + // do hash like boost + // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x + for (const auto num : dims1) { + seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + + for (const auto num : dims2) { + seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1; + } + + for (const auto num : strides) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 2; + } + + for (const auto num : paddings) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 3; + } + + for (const auto num : dilations) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 4; + } + + seed ^= hashFn(static_cast(algorithmFlags)) + 0x9e3779b9 + + (seed << 6) + (seed >> 2) + 5; + + if (seed == 0) return gen_func(); + + if (hash_.find(seed) == hash_.end()) { + TAlgorithm value = gen_func(); + hash_[seed] = value; + } + return hash_[seed]; +} + +template +TAlgorithm AlgorithmsCache::GetAlgorithm( + int64_t area, int search_times, int algorithmFlags, + std::function gen_func) { + if (hash_.find(area) != hash_.end()) { + return hash_[area]; + } + if (search_times_ < search_times) { + auto algo = gen_func(); + hash_[area] = algo; + ++search_times_; + return algo; + } + TAlgorithm algo; + int64_t min = static_cast(INT_MAX); + for (const auto& m : hash_) { + if (m.first < min) { + min = m.first; + algo = m.second; + } + } + return algo; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 129d3a7f0d3580e3df0645239ca015d261e83f94..a498ec5b0b5406b243cca74e8ef14888865324bb 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -185,9 +185,10 @@ std::vector &ParallelExecutor::GetLocalScopes() { ParallelExecutor::ParallelExecutor( const std::vector &places, 
const std::unordered_set &bcast_vars, - const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope, const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) + const std::string &loss_var_name, Scope *scope, + const std::vector &local_scopes, + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, + ir::Graph *graph) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; @@ -221,12 +222,13 @@ ParallelExecutor::ParallelExecutor( PADDLE_ENFORCE(!member_->use_cuda_, "gpu mode does not support async_mode_ now!"); } + std::unique_ptr temp_owned_graph(graph); // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. - build_strategy.enable_parallel_graph_ = - EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); + build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution( + *temp_owned_graph, exec_strategy, build_strategy); if (build_strategy.enable_parallel_graph_) VLOG(0) << "The Executor would execute the graph by ParallelGraph " "Execution which can get better performance," @@ -260,41 +262,45 @@ ParallelExecutor::ParallelExecutor( if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } - // Startup Program has been run. All local scopes has correct parameters. +// Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert - // ncclOp - std::unique_ptr graph; +// Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert +// ncclOp #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - graph = - build_strategy.Apply(main_program, {member_->places_[0]}, loss_var_name, + temp_owned_graph = + build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); } else { - graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), member_->places_, loss_var_name, member_->local_scopes_, member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); } #else if (build_strategy.async_mode_ && !build_strategy.is_distribution_) { VLOG(3) << "use local async mode"; - graph = build_strategy.Apply(main_program, {member_->places_[0]}, + temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), {member_->places_[0]}, loss_var_name, {member_->local_scopes_[0]}, member_->nranks_, member_->use_cuda_); } else { - graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), member_->places_, loss_var_name, member_->local_scopes_, member_->nranks_, member_->use_cuda_); } + #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - graph = member_->PrepareGCAndRefCnts(std::move(graph), - static_cast(max_memory_size)); + graph = member_ + ->PrepareGCAndRefCnts(std::move(temp_owned_graph), + static_cast(max_memory_size)) + .release(); + } else { + graph = temp_owned_graph.release(); } // Step 3. Create vars in each scope. Passes may also create new vars. @@ -328,15 +334,14 @@ ParallelExecutor::ParallelExecutor( VLOG(3) << "use AsyncSSAGraphExecutor"; member_->executor_.reset(new details::AsyncSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + graph)); } else if (build_strategy.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; #ifdef PADDLE_WITH_CUDA // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. 
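A note on the ownership flow in the constructor above: ParallelExecutor no longer receives the ProgramDesc; it now takes a raw ir::Graph* whose lifetime the caller manages. Inside, that raw pointer is temporarily wrapped in a unique_ptr only because BuildStrategy::Apply and PrepareGCAndRefCnts pass graphs around by unique_ptr; the result is then release()d back into the raw pointer, and the SSA graph executors created below borrow the graph rather than owning it (note the std::move(graph) arguments being dropped from their constructors).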
member_->executor_.reset(new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, main_program, - std::move(graphs[0]))); + exec_strategy, member_->local_scopes_, member_->places_, graph)); #else PADDLE_THROW( "Paddle should be compiled with CUDA for ParallelGraph Execution."); @@ -345,13 +350,11 @@ ParallelExecutor::ParallelExecutor( if (exec_strategy.type_ == ExecutionStrategy::kDefault) { VLOG(3) << "use ThreadedSSAGraphExecutor"; member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, graph)); } else { VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, graph)); } } @@ -491,24 +494,33 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } } +ParallelExecutor::~ParallelExecutor() { + for (auto &p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + delete member_; +} + bool ParallelExecutor::EnableParallelGraphExecution( - const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy, + const ir::Graph &graph, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const { if (!FLAGS_enable_parallel_graph) return false; bool enable_parallel_graph = true; - // TODO(Yancey1989): support sparse update in ParallelGraph mode. - for (auto &var_desc : main_program.Block(0).AllVars()) { - if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) { - enable_parallel_graph = false; - } - } - // TODO(Yancey1989): support pserver mode - for (auto &op_desc : main_program.Block(0).AllOps()) { - if (op_desc->Type() == "send" || op_desc->Type() == "recv") { - enable_parallel_graph = false; - break; + for (ir::Node *node : graph.Nodes()) { + if (node->IsVar() && node->Var()) { + // TODO(Yancey1989): support sparse update in ParallelGraph mode. 
+ if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) { + enable_parallel_graph = false; + break; + } + } else if (node->IsOp() && node->Op()) { + // TODO(Yancey1989): support pserver mode + if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") { + enable_parallel_graph = false; + break; + } } } @@ -520,13 +532,6 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } -ParallelExecutor::~ParallelExecutor() { - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - delete member_; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 121bbd55ad575477424a2fb12baab82585eae517..ddf60b39466e72822142e1dad2cfe9a97b6cf6f2 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -46,11 +46,11 @@ class ParallelExecutor { public: explicit ParallelExecutor(const std::vector &places, const std::unordered_set &bcast_vars, - const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy); + const BuildStrategy &build_strategy, + ir::Graph *graph); ~ParallelExecutor(); @@ -71,7 +71,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - bool EnableParallelGraphExecution(const ProgramDesc &main_program, + bool EnableParallelGraphExecution(const ir::Graph &graph, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 40606d9b06baf4dbebf87f3c02580e49ae6e2a70..88f5b757a8111f6a7e269ff71054dab425c0de01 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -27,6 +27,10 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_utils.h" +#endif + namespace paddle { namespace framework { @@ -37,10 +41,34 @@ class Tensor { #ifdef PADDLE_WITH_MKLDNN public: - inline mkldnn::memory::format format() const { return format_; } + // TODO(jczaja): This is depracted and will be removed + inline mkldnn::memory::format format() const { + if (layout_ == DataLayout::kMKLDNN) { + return static_cast(mem_pd_.desc().data.format); + } else { + return mkldnn::memory::format::format_undef; + } + } - inline void set_format(const mkldnn::memory::format format) { - format_ = format; + // TODO(jczaja): This is depracted and will be removed + inline void set_format( + const mkldnn::memory::format fmt, + mkldnn::memory::data_type data_type = mkldnn::memory::f32) { + mem_pd_ = paddle::platform::create_prim_desc_from_format( + paddle::framework::vectorize2int(dims()), fmt, data_type); + layout_ = DataLayout::kMKLDNN; + } + + inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const { + return mem_pd_; + } + + inline void set_mkldnn_prim_desc( + const mkldnn::memory::primitive_desc& mem_pd) { + // Internally MKL-DNN is just copying (increasing reference counter) + // to shared_ptr. 
So asignment should be quite cheap + mem_pd_ = mem_pd; + layout_ = DataLayout::kMKLDNN; } protected: @@ -48,12 +76,9 @@ class Tensor { * @brief the detail format of memory block which have layout as kMKLDNN * * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, layout will be set as - * DataLayout::kMKLDNN meanwhile detail memory format will be kept in - * this field. + * nChw16c, etc. For a MKLDNN memory block, we store memory descriptor */ - - mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; + mutable mkldnn::memory::primitive_desc mem_pd_; #endif public: diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 733542e4972b16a71f9e76c3076b424b7a901066..fa77b96a7bdfa28ed982db022e8e5ecaef0b443c 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -50,8 +50,6 @@ class Scope; } // namespace framework namespace operators { -template -class AlgorithmsCache; class CudnnRNNCache; @@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #ifndef _WIN32 ncclUniqueId, platform::Communicator, #endif - operators::AlgorithmsCache, - operators::AlgorithmsCache, - operators::AlgorithmsCache, operators::CudnnRNNCache, #endif int, float>; diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 8f20f0c06e043ddc629e47c6e49280c5467b0e20..aff5cf24be7c41cf58929069768d4fdb34386ae6 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -249,7 +249,8 @@ std::map> OpBase::ApplyGrad() { framework::Scope scope; PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); p.op.RuntimeInferShape(scope, place_, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + p.func( + framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr)); } } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 78205486c5534ac0c61cc6d545bdafa4dfc95695..bbf614831ca817031b209fffec043495ea24d10f 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -44,8 +44,13 @@ class PreparedOp { PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, framework::OperatorWithKernel::OpKernelFunc func, - platform::DeviceContext* dev_ctx) - : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {} + platform::DeviceContext* dev_ctx, + std::vector* kernel_configs) + : op(op), + ctx(ctx), + func(func), + dev_ctx(dev_ctx), + kernel_configs(kernel_configs) {} static PreparedOp Prepare(const framework::RuntimeContext& ctx, const framework::OperatorWithKernel& op, @@ -64,8 +69,9 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; - auto expected_kernel_key = op.GetExpectedKernelType( - framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx)); + auto expected_kernel_key = + op.GetExpectedKernelType(framework::ExecutionContext( + op, framework::Scope(), *dev_ctx, ctx, nullptr)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -83,7 +89,9 @@ class PreparedOp { PADDLE_THROW("op %s does not have kernel for %s", op.Type(), KernelTypeToString(expected_kernel_key)); } - return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); + std::vector* kernel_configs = + op.GetKernelConfig(expected_kernel_key); + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs); 
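Note that the imperative path mirrors the RunImpl change made earlier in this diff: PreparedOp::Prepare now looks up the selected kernel's config vector via op.GetKernelConfig(expected_kernel_key) and stores it on the PreparedOp, and the tracer further below hands prepared_op.kernel_configs to the ExecutionContext it builds for the kernel invocation.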
} inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } @@ -92,6 +100,7 @@ class PreparedOp { const framework::RuntimeContext& ctx; framework::OperatorWithKernel::OpKernelFunc func; platform::DeviceContext* dev_ctx; + std::vector* kernel_configs; }; class OpBase; @@ -105,23 +114,23 @@ class VarBase { public: VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {} - // Owns `var` and `grad` + explicit VarBase(bool stop_gradient) + : VarBase(new framework::Variable(), + stop_gradient ? nullptr : new VarBase(true), stop_gradient) {} + VarBase(framework::Variable* var, VarBase* grad) + : VarBase(var, grad, false) {} + + private: + VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient) : var_desc_(nullptr), var_(var), grads_(grad), - stop_gradient_(false), - pre_op_(nullptr), - pre_op_out_idx_(-1) {} - - explicit VarBase(bool stop_gradient) - : var_desc_(nullptr), - var_(new framework::Variable()), - grads_(stop_gradient ? nullptr : new VarBase(true)), stop_gradient_(stop_gradient), pre_op_(nullptr), pre_op_out_idx_(-1) {} + public: virtual ~VarBase() { if (var_) { delete var_; @@ -132,11 +141,13 @@ class VarBase { } } - OpBase* PreOp() const { return pre_op_; } - int PreOpOutIdx() const { return pre_op_out_idx_; } + inline OpBase* PreOp() const { return pre_op_; } + inline int PreOpOutIdx() const { return pre_op_out_idx_; } - void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; } - bool IsStopGradient() const { return stop_gradient_; } + inline void SetStopGradient(bool stop_gradient) { + stop_gradient_ = stop_gradient; + } + inline bool IsStopGradient() const { return stop_gradient_; } void RunBackward(); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index bc39d11ba00a6a7c386162a1f9201c6f992c8692..2993ab309027f9306c61023b55b1c061e0ebddc0 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/imperative/tracer.h" +#include + #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -66,16 +68,18 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { return result; } -void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, - const VarBasePtrMap& outputs, framework::BlockDesc* block, - const platform::Place expected_place, - const bool stop_gradient) { +std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, + const VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::Place expected_place, + const bool stop_gradient) { std::map vars; framework::OpDesc* op_desc = op->op_desc_; VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); op_desc->InferVarType(block); + std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); @@ -92,7 +96,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, invars.emplace_back(inp->var_); vars[inp->var_desc_->Name()] = inp; - if (inp->PreOp()) { + if (inp->PreOp() && !inp->IsStopGradient()) { op->pre_ops_[it.first].push_back(inp->PreOp()); op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx()); } else { @@ -138,8 +142,11 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, op->place_ = GetExpectedPlace(expected_place, inputs); PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); - 
prepared_op.func(framework::ExecutionContext( - prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); + prepared_op.func( + framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, + prepared_op.ctx, prepared_op.kernel_configs)); + + std::set vars_saved_for_backward; if (!stop_gradient) { std::unique_ptr> grad_to_var( @@ -160,6 +167,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, PADDLE_ENFORCE(fwd_var_it != vars.end()); // Forward inputs or outputs. grad_in_vars.push_back(fwd_var_it->second->var_); + vars_saved_for_backward.insert(it.first); } else { VarBase* var = vars[var_it->second]; if (!var->grads_->var_->IsInitialized()) { @@ -193,6 +201,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, } op->block_ = block; + return vars_saved_for_backward; } std::vector Tracer::PyTrace(OpBase* op, @@ -202,7 +211,7 @@ std::vector Tracer::PyTrace(OpBase* op, op->input_vars_[PyLayer::kFwdInp] = inputs; op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs); for (VarBase* inp : inputs) { - if (inp->PreOp()) { + if (inp->PreOp() && !inp->IsStopGradient()) { op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp()); op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx()); } else { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 690838215581b09ff35a0ea13f30655b77e6e187..98909e378f0e4188250fcb6efd9502dcc9740da4 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include @@ -43,10 +44,11 @@ class Tracer { virtual ~Tracer() {} - void Trace(OpBase* op, const VarBasePtrMap& inputs, - const VarBasePtrMap& outputs, framework::BlockDesc* block, - const platform::Place expected_place, - const bool stop_gradient = false); + std::set Trace(OpBase* op, const VarBasePtrMap& inputs, + const VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::Place expected_place, + const bool stop_gradient = false); std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index e92273b4dd94f11e0e90c91fd82dafe42bf158f3..522ab495227e9b8c52b8d38db696fa9b785ba642 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(params_file_); CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and // params_file_ fields. - // Gpu releated. + // Gpu related. CP_MEMBER(use_gpu_); CP_MEMBER(device_id_); CP_MEMBER(memory_pool_init_size_mb_); @@ -97,13 +97,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(enable_memory_optim_); CP_MEMBER(static_memory_optim_); CP_MEMBER(static_memory_optim_force_update_); - // TensorRT releated. + // TensorRT related. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_precision_mode_); - // MKLDNN releated. + // MKLDNN related. 
CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 712e010db4340bde55945f89c488ba8cc38a1926..e8964c4acea0d220deca048a018eb7de42d7e4e5 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -392,7 +392,7 @@ std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu()) { - // 1. GPU memeroy + // 1. GPU memory PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", config.gpu_device_id()); @@ -726,7 +726,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { return need; } -std::string AnalysisPredictor::GetSeriazlizedProgram() const { +std::string AnalysisPredictor::GetSerializedProgram() const { return inference_program_->Proto()->SerializeAsString(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 014df4ee8b6d86232212736c43a9aff32ffee011..d5445c58e45ae64a8cfab03cb610e3677729338b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -74,7 +74,7 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); - std::string GetSeriazlizedProgram() const override; + std::string GetSerializedProgram() const override; protected: // For memory optimization. diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 002ba90e40e69d565f5a54e374a3f0083b84273f..6696839b53fb21c274843afd86b5d8b5c2042c51 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) { { // The first predictor help to cache the memory optimize strategy. auto predictor = CreatePaddlePredictor(config); - LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram(); - ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty()); + LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram(); + ASSERT_FALSE(predictor->GetSerializedProgram().empty()); // Run several times to check the parameters are not reused by mistake. for (int i = 0; i < 5; i++) { diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index e18bc02d92eb517fa20dc83811694b8ac80ae316..97c164bdef7a4b3e66be78526793f3830ada398b 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -290,7 +290,7 @@ std::unique_ptr CreatePaddlePredictor< NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { - // 1. GPU memeroy + // 1. 
GPU memory PADDLE_ENFORCE_GE( config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 47361b3279e14dd65a0e6e7f864e508ef1183045..c1c6227cdd8b2042f6765c7932327ecae246c260 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -212,12 +212,12 @@ struct AnalysisConfig { std::string prog_file_; std::string params_file_; - // GPU releated. + // GPU related. bool use_gpu_{false}; int device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. - // TensorRT releated. + // TensorRT related. bool use_tensorrt_{false}; // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index f90a74b9102ee62d15c2d738b53971c9bde51439..c9a45b4aa3b4037d3725622fc960848bc1ccfb2c 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -248,7 +248,7 @@ class PaddlePredictor { /** \brief Get the serialized model program that executes in inference phase. * Its data type is ProgramDesc, which is a protobuf message. */ - virtual std::string GetSeriazlizedProgram() const { + virtual std::string GetSerializedProgram() const { assert(false); // Force raise error. return "NotImplemented"; } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 7ecd9e35332843e3a391cdad5ce32220d890abd1..55ab04bfe16ec6a3d97c443f59c72e7b85fb1899 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) +# TODO(luotao, Superjom) Disable DAM test, temporarily fix +# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914. +# After inference framework refactor, will reopen it. 
# normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) +#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e099425b94221bf1229e936fc1781615d13dbc26..a3f2a69aef52b6f55aa09e6dee2c22c048626c0d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) @@ -97,3 +97,4 @@ if (WITH_PYTHON) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") +add_subdirectory(benchmark) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 7f2bde55c98277b9fd4b3374657001c42d673d43..cf78c83297a87beb08a8b8e6e4b182f03f1909d3 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase { auto& dev_ctx = *pool.Get(dev_place); framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); - framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx, nullptr); const LoDTensorArray* ids = ctx.Input("Ids"); const LoDTensorArray* scores = ctx.Input("Scores"); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 6aefc5446f167eebb0da673b3fbdf7ed128daa98..0b883c3158fb922caae2e731875bbb8d43a1e9ca 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -122,7 +122,7 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( auto cpu_place = std::unique_ptr( new paddle::platform::CPUPlace()); - paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place.get()); + paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); framework::LoD lod; lod.push_back(source_level_lod); diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..54008336a9f67f0123ba1cfa6fcea35b79b7ac4c --- /dev/null +++ b/paddle/fluid/operators/benchmark/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_test(op_tester SRCS op_tester.cc op_tester_config.cc + DEPS memory timer framework_proto proto_desc lod_tensor 
op_registry + device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..e179de56cddc5fada2e5833086d351659a7cf540 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -0,0 +1,303 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/benchmark/op_tester.h" +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/timer.h" +#include "paddle/fluid/pybind/pybind.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +DEFINE_string(op_config_list, "", "Path of op config file."); + +void OpTester::Init(const std::string &filename) { + Init(OpTesterConfig(filename)); +} + +void OpTester::Init(const OpTesterConfig &config) { + config_ = config; + + auto &op_desc_info = framework::OpInfoMap::Instance(); + // Initialize the OpDesc + if (op_desc_info.Has(config_.op_type)) { + type_ = config_.op_type; + op_desc_.SetType(config_.op_type); + + CreateInputVarDesc(); + CreateOutputVarDesc(); + } else { + LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered."; + } + + if (config_.device_id >= 0) { + place_ = paddle::platform::CUDAPlace(config_.device_id); + } else { + place_ = paddle::platform::CPUPlace(); + } + + framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + + op_ = framework::OpRegistry::CreateOp(op_desc_); + CreateVariables(scope_.get()); +} + +void OpTester::Run() { + if (config_.print_debug_string) { + LOG(INFO) << DebugString(); + } + + // Warm up + RunImpl(); + + platform::Timer timer; + if (config_.profile) { + if (platform::is_cpu_place(place_)) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else { +#ifdef PADDLE_WITH_CUDA + platform::EnableProfiler(platform::ProfilerState::kAll); + platform::SetDeviceId(config_.device_id); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif + } + + timer.Start(); + for (int i = config_.repeat; i > 0; --i) { + RunImpl(); + } + timer.Pause(); + platform::DisableProfiler(platform::EventSortingKey::kDefault, + "op_tester_profiler"); + } else { + timer.Start(); + for (int i = config_.repeat; i > 0; --i) { + RunImpl(); + } + timer.Pause(); + } + config_.runtime = timer.ElapsedMS() / config_.repeat; + LOG(INFO) << "=== Run " << config_.repeat + << " times, latency: " << config_.runtime << " ms ==="; +} + +void OpTester::RunImpl() { + op_->Run(*scope_, place_); + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + scope_->DropKids(); +} + +std::vector OpTester::GetOpProtoInputNames() { + std::vector input_names; + const 
framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + for (int i = 0; i != proto.inputs_size(); ++i) { + const auto &input = proto.inputs(i); + input_names.push_back(input.name()); + } + return input_names; +} + +std::vector OpTester::GetOpProtoOutputNames() { + std::vector output_names; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + for (int i = 0; i != proto.outputs_size(); ++i) { + const auto &output = proto.outputs(i); + output_names.push_back(output.name()); + } + return output_names; +} + +void OpTester::CreateInputVarDesc() { + std::vector input_names = GetOpProtoInputNames(); + for (auto &name : input_names) { + const OpInputConfig *input = config_.GetInput(name); + if (input == nullptr) { + LOG(FATAL) << "The input " << name << " of op " << config_.op_type + << " is not correctlly provided."; + } + + std::string var_name = config_.op_type + "." + name; + framework::VarDesc *var = Var(var_name); + // Need to support more type + var->SetType(framework::proto::VarType::LOD_TENSOR); + var->SetPersistable(false); + var->SetDataType(framework::proto::VarType::FP32); + var->SetShape(input->dims); + + op_desc_.SetInput(name, {var_name}); + inputs_.push_back(var_name); + } +} + +void OpTester::CreateOutputVarDesc() { + std::vector output_names = GetOpProtoOutputNames(); + for (auto &name : output_names) { + std::string var_name = config_.op_type + "." + name; + framework::VarDesc *var = Var(var_name); + // Need to support more type + var->SetType(framework::proto::VarType::LOD_TENSOR); + var->SetPersistable(false); + var->SetDataType(framework::proto::VarType::FP32); + + op_desc_.SetOutput(name, {var_name}); + outputs_.push_back(var_name); + } +} + +framework::VarDesc *OpTester::Var(const std::string &name) { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } + auto *var = new framework::VarDesc(name); + vars_[name].reset(var); + return var; +} + +template +void OpTester::SetupTensor(framework::LoDTensor *tensor, + const std::vector &shape, T lower, + T upper) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + + T *ptr = tensor->mutable_data(framework::make_ddim(shape), place_); + if (platform::is_cpu_place(place_)) { + for (int i = 0; i < tensor->numel(); ++i) { + ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } + } else { + framework::LoDTensor cpu_tensor; + T *cpu_ptr = cpu_tensor.mutable_data(framework::make_ddim(shape), + platform::CPUPlace()); + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } + TensorCopySync(cpu_tensor, place_, tensor); + } +} + +void OpTester::CreateVariables(framework::Scope *scope) { + for (auto &item : vars_) { + auto &var = item.second; + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + auto *ptr = scope->Var(var->Name()); + framework::InitializeVariable(ptr, var->GetType()); + if (var->Persistable()) { + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + + // Allocate memory for input tensor + for (auto &name : inputs_) { + VLOG(3) << "Allocate memory for tensor " << name; + auto &var_desc = vars_[name]; + std::vector shape = var_desc->GetShape(); + + auto *var = scope->Var(name); + auto *tensor = 
var->GetMutable(); + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0)); + } +} + +static std::string GenSpaces(int count) { + std::stringstream ss; + for (int i = 0; i < count; ++i) { + ss << " "; + } + return ss.str(); +} + +std::string OpTester::DebugString() { + std::stringstream ss; + int count = 0; + for (auto &item : vars_) { + auto &var = item.second; + ss << GenSpaces(count++) << "vars {\n"; + ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n"; + ss << GenSpaces(count++) << "type: {\n"; + ss << GenSpaces(count) << "type: LOD_TENSOR\n"; + ss << GenSpaces(count++) << "lod_tensor {\n"; + ss << GenSpaces(count++) << "tensor {\n"; + ss << GenSpaces(count) << "data_type: FP32\n"; + std::vector shape = var->GetShape(); + for (auto d : shape) { + ss << GenSpaces(count) << "dims: " << d << "\n"; + } + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n"; + ss << GenSpaces(--count) << "}\n"; + } + ss << GenSpaces(count++) << "ops {\n"; + for (auto &name : op_desc_.InputNames()) { + ss << GenSpaces(count++) << "inputs {\n"; + ss << GenSpaces(count) << "parameters: \"" << name << "\"\n"; + ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0] + << "\"\n"; + ss << GenSpaces(--count) << "}\n"; + } + for (auto &name : op_desc_.OutputNames()) { + ss << GenSpaces(count++) << "outputs {\n"; + ss << GenSpaces(count) << "parameters: \"" << name << "\"\n"; + ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0] + << "\"\n"; + ss << GenSpaces(--count) << "}\n"; + } + ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n"; + ss << GenSpaces(--count) << "}\n"; + return ss.str(); +} + +TEST(op_tester, base) { + OpTester tester; + if (!FLAGS_op_config_list.empty()) { + tester.Init(FLAGS_op_config_list); + } else { + OpTesterConfig config; + config.op_type = "elementwise_add"; + config.inputs.resize(2); + config.inputs[0].name = "X"; + config.inputs[0].dims = {64, 64}; + config.inputs[1].name = "Y"; + config.inputs[1].dims = {64, 1}; + tester.Init(config); + } + tester.Run(); +} + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h new file mode 100644 index 0000000000000000000000000000000000000000..1723d46c47ed67199713e6d726c6245f34f7c224 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/benchmark/op_tester_config.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +class OpTester { + public: + OpTester() {} + + void Init(const std::string &filename); + void Init(const OpTesterConfig &config); + + void Run(); + + std::string DebugString(); + + private: + std::vector GetOpProtoInputNames(); + std::vector GetOpProtoOutputNames(); + + void CreateInputVarDesc(); + void CreateOutputVarDesc(); + + framework::VarDesc *Var(const std::string &name); + void CreateVariables(framework::Scope *scope); + + template + void SetupTensor(framework::LoDTensor *input, + const std::vector &shape, T lower, T upper); + + void RunImpl(); + + private: + OpTesterConfig config_; + std::string type_; + framework::OpDesc op_desc_; + std::unordered_map> vars_; + std::vector inputs_; + std::vector outputs_; + std::unique_ptr op_; + platform::Place place_; + std::unique_ptr scope_; +}; + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc new file mode 100644 index 0000000000000000000000000000000000000000..3db8de7f76801eb814b57859d6b95590761c96f3 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/benchmark/op_tester_config.h" +#include +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +static const char kStartSeparator[] = "{"; +static const char kEndSeparator[] = "}"; +static const char kSepBetweenItems[] = ";"; + +static bool StartWith(const std::string& str, const std::string& substr) { + return str.find(substr) == 0; +} + +static bool EndWith(const std::string& str, const std::string& substr) { + return str.rfind(substr) == (str.length() - substr.length()); +} + +static void EraseEndSep(std::string* str) { + std::string substr = kSepBetweenItems; + if (EndWith(*str, substr)) { + str->erase(str->length() - substr.length(), str->length()); + } +} + +static std::vector ParseDims(std::string dims_str) { + std::vector dims; + std::string token; + std::istringstream token_stream(dims_str); + while (std::getline(token_stream, token, 'x')) { + dims.push_back(std::stoi(token)); + } + return dims; +} + +OpInputConfig::OpInputConfig(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "name" || sep == "name:") { + is >> name; + EraseEndSep(&name); + } else if (sep == "dims" || sep == "dims:") { + std::string dims_str; + is >> dims_str; + dims = ParseDims(dims_str); + } + } + } +} + +OpTesterConfig::OpTesterConfig(const std::string& filename) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", + filename.c_str()); + + Init(fin); +} + +void OpTesterConfig::Init(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "op_type" || sep == "op_type:") { + is >> op_type; + } else if (sep == "device_id" || sep == "device_id:") { + is >> device_id; + } else if (sep == "repeat" || sep == "repeat:") { + is >> repeat; + } else if (sep == "profile" || sep == "profile:") { + is >> profile; + } else if (sep == "print_debug_string" || sep == "print_debug_string:") { + is >> print_debug_string; + } else if (sep == "input" || sep == "input:") { + OpInputConfig input_config(is); + inputs.push_back(input_config); + } + } + } +} + +const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) { + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i].name == name) { + return &inputs[i]; + } + } + return nullptr; +} + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h new file mode 100644 index 0000000000000000000000000000000000000000..f7b62cb8ad03b410a2ea99fe4c2a8dc8a6bea7a7 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester_config.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { +namespace benchmark { + +struct OpInputConfig { + OpInputConfig() {} + explicit OpInputConfig(std::istream& is); + + std::string name; + std::vector dims; +}; + +struct OpTesterConfig { + OpTesterConfig() {} + explicit OpTesterConfig(const std::string& filename); + void Init(std::istream& is); + + const OpInputConfig* GetInput(const std::string& name); + + std::string op_type; + std::vector inputs; + int device_id{-1}; // CPU: -1 + int repeat{1}; + int profile{0}; + int print_debug_string{0}; + double runtime{0.0}; +}; + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index f5208e7a601f4dd33b486e5840178022f66431e5..9e5ccd928e9d6012c1da3baa17521dcac0c8ff2f 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -42,6 +42,7 @@ using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; +using framework::AlgorithmsCache; template class CUDNNConvOpKernel : public framework::OpKernel { @@ -169,18 +170,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else if (exhaustive_search && (!half_float)) { - AlgorithmsCache* algo_cache = nullptr; - if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { - algo_cache = - ctx.scope() - .FindVar(kCUDNNFwdAlgoCache) - ->GetMutable>(); - } else { - algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNFwdAlgoCache) - ->GetMutable>(); - } + AlgorithmsCache& algo_cache = + ctx.GetKernelConfig>(0); cudnn_workspace = ctx.AllocateTmpTensor( framework::make_ddim( @@ -188,7 +179,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { dev_ctx); cudnn_workspace_ptr = static_cast(cudnn_workspace.data()); - algo = algo_cache->GetAlgorithm( + algo = algo_cache.GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array @@ -382,22 +373,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { - AlgorithmsCache* data_algo_cache; - if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) { - data_algo_cache = - ctx.scope() - .FindVar(kCUDNNBwdDataAlgoCache) - ->GetMutable< - AlgorithmsCache>(); - } else { - data_algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNBwdDataAlgoCache) - ->GetMutable< - AlgorithmsCache>(); - } - - data_algo = data_algo_cache->GetAlgorithm( + AlgorithmsCache& data_algo_cache = + ctx.GetKernelConfig>( + 0); + + data_algo = data_algo_cache.GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array { if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { - AlgorithmsCache* f_algo_cache; - if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) { - f_algo_cache = - ctx.scope() - .FindVar(kCUDNNBwdFilterAlgoCache) - ->GetMutable< - AlgorithmsCache>(); - } else { - f_algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNBwdFilterAlgoCache) - ->GetMutable< - AlgorithmsCache>(); - } - - filter_algo = f_algo_cache->GetAlgorithm( + AlgorithmsCache& f_algo_cache = + ctx.GetKernelConfig< + AlgorithmsCache>(1); + + filter_algo = 
f_algo_cache.GetAlgorithm( x_dims, f_dims, strides, paddings, dilations, 0, [&]() { int returned_algo_count; std::array #include #include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cudnn_helper.h" DECLARE_uint64(conv_workspace_size_limit); @@ -46,100 +47,5 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; #endif -template -class AlgorithmsCache { - public: - AlgorithmsCache() : search_times_(0) { hash_.clear(); } - // Caches the best algorithm for a given - // combination of tensor dimensions & compute data type. - TAlgorithm GetAlgorithm( - const std::vector& dims1, const std::vector& dims2, - const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, - int algorithmFlags, // can set for different data type - std::function gen_func); - - TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, - std::function gen_func); - - private: - std::unordered_map hash_; - std::mutex mutex_; - - int search_times_; -}; - -template -TAlgorithm AlgorithmsCache::GetAlgorithm( - const std::vector& dims1, const std::vector& dims2, - const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, int algorithmFlags, - std::function gen_func) { - std::lock_guard lock(mutex_); - int64_t seed = 0; - // Hash all of the inputs, use to try and look up a previously - // discovered algorithm, or fall back to generating a new one. - std::hash hashFn; - // do hash like boost - // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x - for (const auto num : dims1) { - seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - - for (const auto num : dims2) { - seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1; - } - - for (const auto num : strides) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 2; - } - - for (const auto num : paddings) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 3; - } - - for (const auto num : dilations) { - seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + - (seed >> 2) + 4; - } - - seed ^= hashFn(static_cast(algorithmFlags)) + 0x9e3779b9 + - (seed << 6) + (seed >> 2) + 5; - - if (seed == 0) return gen_func(); - - if (hash_.find(seed) == hash_.end()) { - TAlgorithm value = gen_func(); - hash_[seed] = value; - } - return hash_[seed]; -} - -template -TAlgorithm AlgorithmsCache::GetAlgorithm( - int64_t area, int search_times, int algorithmFlags, - std::function gen_func) { - if (hash_.find(area) != hash_.end()) { - return hash_[area]; - } - if (search_times_ < search_times) { - auto algo = gen_func(); - hash_[area] = algo; - ++search_times_; - return algo; - } - TAlgorithm algo; - int64_t min = static_cast(INT_MAX); - for (const auto& m : hash_) { - if (m.first < min) { - min = m.first; - algo = m.second; - } - } - return algo; -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index d8b997cca613f660046106512fc03bf55f9b992d..64152829b4f000e545054e528edca33dfe96ec56 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -30,6 +30,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using ScopedActivationDescriptor = platform::ScopedActivationDescriptor; 
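The AlgorithmsCache removed above from conv_cudnn_op_cache.h (and now pulled in as framework::AlgorithmsCache) keys each cached cuDNN algorithm by folding every dimension, stride, padding and dilation into one 64-bit seed with a boost-style hash combine. A minimal standalone sketch of that keyed memoization, assuming only the standard library and a caller-supplied gen_func standing in for the exhaustive search:

#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

// Memoize the result of an expensive search functor under a hash of the
// convolution shape parameters (same hash-combine as the removed cache).
template <typename TAlgorithm>
class SimpleAlgorithmsCache {
 public:
  TAlgorithm GetAlgorithm(const std::vector<int64_t>& dims,
                          const std::vector<int>& strides,
                          std::function<TAlgorithm()> gen_func) {
    std::hash<int64_t> hash_fn;
    int64_t seed = 0;
    for (int64_t d : dims) {
      seed ^= hash_fn(d) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }
    for (int s : strides) {
      seed ^= hash_fn(s) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
    }
    auto it = cache_.find(seed);
    if (it != cache_.end()) return it->second;
    TAlgorithm algo = gen_func();  // fall back to the expensive search
    cache_[seed] = algo;
    return algo;
  }

 private:
  std::unordered_map<int64_t, TAlgorithm> cache_;
};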
using DataLayout = platform::DataLayout; +using framework::AlgorithmsCache; + template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; @@ -139,38 +141,21 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } return fwd_perf_stat[0].algo; }; - AlgorithmsCache* algo_cache = nullptr; + AlgorithmsCache& algo_cache = + ctx.GetKernelConfig>(0); int search_times = ctx.Attr("search_times"); search_times = std::max( static_cast(FLAGS_cudnn_exhaustive_search_times), search_times); + // TODO(dangqingqing): Unify this if-else. if (search_times > 0) { // The searched algo will be cached by `search_times` times for // different input dimension. For other dimensions, select the algo // of closest area. - auto var_name = ctx.Inputs("AlgoCache")[0]; - algo_cache = - ctx.scope() - .FindVar(var_name) - ->GetMutable>(); - algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, - search_func); + algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, + search_func); } else { - // Cache searched algo in Var(kCUDNNFwdAlgoCache). - // all conv ops use the same kCUDNNFwdAlgoCache variable. - if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { - algo_cache = - ctx.scope() - .FindVar(kCUDNNFwdAlgoCache) - ->GetMutable>(); - } else { - // TODO(qingqing) remove const_cast - algo_cache = - const_cast(ctx.scope().parent()) - ->Var(kCUDNNFwdAlgoCache) - ->GetMutable>(); - } - algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings, - dilations, 0, search_func); + algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings, + dilations, 0, search_func); } VLOG(3) << "choose algo " << algo; } diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index fd9f156d070bdb1990a2fc9c63305933050e5524..a37c8d3ccd9c3bb8fae8a5f198bc4db714301b68 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef PADDLE_WITH_MKLDNN @@ -109,8 +110,20 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( "float16 can only be used when CUDNN is used"); } - return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library, customized_type_value); + auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library, customized_type_value); +#ifdef PADDLE_WITH_CUDA + std::vector& configs = kernel_configs_map_[type]; + // TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn + // to false. It should be fixed and then here should only create if library + // is kCUDNN. 
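The conv_op.cc hunk below registers the algorithm caches once per OpKernelType in kernel_configs_map_, so the CUDNN kernels shown earlier can fetch them with ctx.GetKernelConfig<AlgorithmsCache<...>>(0) (forward) and (1) (backward filter) instead of stashing them in scope variables. A rough illustration of the same registry pattern; the struct names, map, and index convention here are simplified stand-ins, not the framework's actual classes:

#include <map>
#include <memory>
#include <string>
#include <vector>

// Each kernel "type" owns a vector of config objects, created once when the
// kernel type is resolved and looked up by index inside the kernel body.
struct FwdAlgoCache { int best_algo = -1; };      // hypothetical stand-in
struct BwdFilterAlgoCache { int best_algo = -1; } ;  // hypothetical stand-in

std::map<std::string, std::vector<std::shared_ptr<void>>> kernel_configs_map;

void RegisterConfigs(const std::string& kernel_type) {
  auto& configs = kernel_configs_map[kernel_type];
  if (configs.empty()) {
    configs.push_back(std::make_shared<FwdAlgoCache>());        // index 0
    configs.push_back(std::make_shared<BwdFilterAlgoCache>());  // index 1
  }
}

template <typename T>
T& GetKernelConfig(const std::string& kernel_type, size_t idx) {
  return *std::static_pointer_cast<T>(kernel_configs_map.at(kernel_type)[idx]);
}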
+ if (configs.empty()) { + std::shared_ptr> p( + new framework::AlgorithmsCache()); + configs.push_back(p); + } +#endif + return type; } void Conv2DOpMaker::Make() { @@ -410,9 +423,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( } #endif - return framework::OpKernelType(ctx.Input("Input")->type(), - ctx.GetPlace(), layout_, library_, - customized_type_value); + auto type = framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_, + customized_type_value); +#ifdef PADDLE_WITH_CUDA + if (library_ == framework::LibraryType::kCUDNN) { + std::vector& configs = kernel_configs_map_[type]; + if (configs.empty()) { + std::shared_ptr> + p(new framework::AlgorithmsCache()); + configs.push_back(p); + + std::shared_ptr< + framework::AlgorithmsCache> + p2(new framework::AlgorithmsCache()); + configs.push_back(p2); + } + } +#endif + return type; } class Conv2dGradMaker : public framework::SingleGradOpDescMaker { diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 2a69ad4b53c26f5e2e0547e75e0d9c6518a8bcba..ab01bdf7ca8c5a369bd8838b1acc734364666992 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "The ignore threshold to ignore confidence loss.") .SetDefault(0.7); AddComment(R"DOC( - This operator generate yolov3 loss by given predict result and ground + This operator generates yolov3 loss based on given predict result and ground truth boxes. The output of previous network is in shape [N, C, H, W], while H and W - should be the same, specify the grid size, each grid point predict given - number boxes, this given number is specified by anchors, it should be - half anchors length, which following will be represented as S. In the - second dimention(the channel dimention), C should be S * (class_num + 5), - class_num is the box categoriy number of source dataset(such as coco), - so in the second dimention, stores 4 box location coordinates x, y, w, h - and confidence score of the box and class one-hot key of each anchor box. + should be the same, H and W specify the grid size, each grid point predict + given number boxes, this given number, which following will be represented as S, + is specified by the number of anchors, In the second dimension(the channel + dimension), C should be equal to S * (class_num + 5), class_num is the object + category number of source dataset(such as 80 in coco dataset), so in the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor box. - While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions - correspnd to: + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions + should be as follows: $$ - b_x = \sigma(t_x) + c_x - b_y = \sigma(t_y) + c_y + b_x = \\sigma(t_x) + c_x + $$ + $$ + b_y = \\sigma(t_y) + c_y + $$ + $$ b_w = p_w e^{t_w} + $$ + $$ b_h = p_h e^{t_h} $$ - While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ - is specified by anchors. + In the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. 
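For illustration, the box decoding described by the equations above can be written as a few lines of plain C++; sigma is the logistic sigmoid, and the variable names follow the documentation rather than the operator's API:

#include <cmath>

struct Box { float x, y, w, h; };

// Decode one predicted box from the raw outputs (t_x, t_y, t_w, t_h),
// the grid cell corner (c_x, c_y) and the anchor size (p_w, p_h).
Box DecodeYoloBox(float tx, float ty, float tw, float th,
                  float cx, float cy, float pw, float ph) {
  auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
  Box b;
  b.x = sigmoid(tx) + cx;   // b_x = sigma(t_x) + c_x
  b.y = sigmoid(ty) + cy;   // b_y = sigma(t_y) + c_y
  b.w = pw * std::exp(tw);  // b_w = p_w * e^{t_w}
  b.h = ph * std::exp(th);  // b_h = p_h * e^{t_h}
  return b;
}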
As for confidence score, it is the logistic regression value of IoU between anchor boxes and ground truth boxes, the score of the anchor box which has - the max IoU should be 1, and if the anchor box has IoU bigger then ignore + the max IoU should be 1, and if the anchor box has IoU bigger than ignore thresh, the confidence score loss of this anchor box will be ignored. Therefore, the yolov3 loss consist of three major parts, box location loss, @@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { In order to trade off box coordinate losses between big boxes and small boxes, box coordinate losses will be mutiplied by scale weight, which is - calculated as follow. + calculated as follows. $$ weight_{box} = 2.0 - t_w * t_h $$ - Final loss will be represented as follow. + Final loss will be represented as follows. $$ loss = (loss_{xy} + loss_{wh}) * weight_{box} diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 2632bfb6de1751982c5c836fc96bf57d5b2844d6..356eba6f86ad180c7d23bf7fa91eb5d455ff5f08 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include #include @@ -259,7 +259,7 @@ struct TestFuncWithRefer, std::vector, std::vector, const std::vector& x, const std::vector& yref, const typename jit::SeqPoolTuples::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size() % yref.size(), 0); + EXPECT_EQ(x.size() % yref.size(), static_cast(0)); int w = yref.size(); std::vector y(w); const T* x_data = x.data(); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 3f110024b285d41ccfe305e35c8efca5ed5ee0fe..ca998826dd0118ab4b1ecc23bed8ef882f1bcc92 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel { lstm_value.output_value = out_t.data(); lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); + T cell_clip = 0.0; math::LstmUnitFunctor::compute( - device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, - cell_act, cand_act); + device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip, + gate_act, cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } @@ -316,9 +317,10 @@ class LSTMGradKernel : public framework::OpKernel { lstm_value.output_value = nullptr; lstm_grad.state_active_grad = nullptr; int cur_batch_size = bend - bstart; + T cell_clip = 0.0; math::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, - gate_act, cell_act, cand_act); + cell_clip, gate_act, cell_act, cand_act); if (n > 0) { int pre_h_start = static_cast(batch_starts[n - 1]); diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 7a62bc9f828e4d3485628747cdf52c60c5354144..2728aa8a4ee21a9e1fe3deddcdba4c35a6aba7bc 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("C0"), "Input(C0) of LSTMP operator should not be null after " "Input(H0) provided."); - auto h_dims = ctx->GetInputDim("H0"); - auto c_dims = ctx->GetInputDim("C0"); - PADDLE_ENFORCE(h_dims == c_dims, - "The dimension of Input(H0) and Input(C0) " - "should be the same."); - ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]}); } auto b_dims = ctx->GetInputDim("Bias"); @@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { "This LoDTensor is obtained in the forward and used in the " "backward.") .AsIntermediate(); - AddOutput("OrderedP0", - "(Tensor) the projection of the initial hidden state " - "H0. 
This is a tensor with shape (N x P), where N is the " - "batch size and P is the hidden size.") - .AsIntermediate(); AddAttr("use_peepholes", "(bool, defalut: True) " "whether to enable diagonal/peephole connections.") @@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, defalut: False) " "whether to compute reversed LSTMP.") .SetDefault(false); + AddAttr("cell_clip", + "(float, defalut: 0.0) " + "Clip for Tensor for cell state tensor when clip value is " + "greater than 0.0") + .SetDefault(0.0); + AddAttr("proj_clip", + "(float, defalut: 0.0) " + "Clip for Tensor for projection tensor when clip value is " + "greater than 0.0") + .SetDefault(0.0); AddAttr( "gate_activation", "(string, default: sigmoid)" diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 1f11e57dcb721012c7b8e50d7e138355685053da..c7d6e4205f8862526904e4fa767a2f4c4a2d8481 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" @@ -21,17 +22,50 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/transform.h" namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; +using platform::Transform; template using EigenMatrix = framework::EigenMatrix; +template +class _ClipFunctor { + public: + explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x) const { + if (x < min_) + return min_; + else if (x > max_) + return max_; + else + return x; + } + + private: + T min_; + T max_; +}; + +template +class _ClipGradFunctor { + public: + explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x, const T& y) const { + return (y > min_ && y < max_) ? x : 0; + } + + private: + T min_; + T max_; +}; + template inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, @@ -67,9 +101,11 @@ class LSTMPKernel : public framework::OpKernel { auto* bias = ctx.Input("Bias"); auto* hidden_t0 = ctx.Input("H0"); - auto* ordered_proj0 = ctx.Output("OrderedP0"); auto* cell_t0 = ctx.Input("C0"); + auto proj_clip = static_cast(ctx.Attr("proj_clip")); + auto cell_clip = static_cast(ctx.Attr("cell_clip")); + auto* batch_gate = ctx.Output("BatchGate"); batch_gate->mutable_data(ctx.GetPlace()); auto* proj_out = ctx.Output("Projection"); @@ -110,6 +146,7 @@ class LSTMPKernel : public framework::OpKernel { } lstmp_value.prev_state_value = nullptr; Tensor ordered_c0; + Tensor ordered_h0; framework::Vector order(batch_gate->lod()[2]); @@ -169,18 +206,9 @@ class LSTMPKernel : public framework::OpKernel { // Since the batch computing for LSTMP reorders the input sequence // according to their length. The initialized hidden state also needs // to reorder. 
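The clipping introduced in this patch, cell_clip inside the LSTM unit kernels and proj_clip applied through Transform with _ClipFunctor, amounts to an element-wise clamp into [-clip, clip] that is skipped whenever the clip attribute is not positive. A minimal CPU-only sketch of that behaviour, separate from the operator code:

#include <algorithm>
#include <vector>

// Clamp every element of a buffer into [-clip, clip]; a no-op when
// clip <= 0, matching the cell_clip / proj_clip attribute semantics.
void ClipInPlace(std::vector<float>* data, float clip) {
  if (clip <= 0.0f) return;
  for (float& v : *data) {
    v = std::min(clip, std::max(-clip, v));
  }
}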
- - Tensor ordered_h0; - ordered_proj0->mutable_data(ctx.GetPlace()); ReorderInitState(device_ctx, *hidden_t0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast(1.0), - ordered_proj0, static_cast(0.0)); - if (proj_act != math::detail::ActivationType::kIdentity) { - auto proj0_dev = EigenMatrix::From(*ordered_proj0); - ActCompute(cell_act, place, proj0_dev, proj0_dev); - } - blas.MatMul(*ordered_proj0, false, *weight, false, static_cast(1.0), + blas.MatMul(ordered_h0, false, *weight, false, static_cast(1.0), &gate_t, static_cast(1.0)); } @@ -189,8 +217,8 @@ class LSTMPKernel : public framework::OpKernel { lstmp_value.state_value = cell_t.data(); lstmp_value.state_active_value = cell_pre_act_t.data(); math::LstmUnitFunctor::compute( - device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act, - cell_act, cand_act); + device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip, + gate_act, cell_act, cand_act); lstmp_value.prev_state_value = lstmp_value.state_value; blas.MatMul(hidden_t, false, *proj_weight, false, static_cast(1.0), &proj_t, static_cast(0.0)); @@ -198,6 +226,14 @@ class LSTMPKernel : public framework::OpKernel { auto proj_t_dev = EigenMatrix::From(proj_t); ActCompute(cell_act, place, proj_t_dev, proj_t_dev); } + if (proj_clip && proj_clip > 0.0) { + T* x_data = proj_t.data(); + int64_t numel = proj_t.numel(); + Transform trans; + trans(ctx.template device_context(), x_data, + x_data + numel, x_data, + _ClipFunctor(-1.0 * proj_clip, proj_clip)); + } } math::Batch2LoDTensorFunctor to_seq; @@ -239,6 +275,9 @@ class LSTMPGradKernel : public framework::OpKernel { auto* proj_out = ctx.Input("Projection"); auto* cell_out = ctx.Input("Cell"); + auto proj_clip = static_cast(ctx.Attr("proj_clip")); + auto cell_clip = static_cast(ctx.Attr("cell_clip")); + auto* batch_gate = ctx.Input("BatchGate"); auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); auto* batch_hidden = ctx.Input("BatchHidden"); @@ -253,7 +292,6 @@ class LSTMPGradKernel : public framework::OpKernel { auto* bias_g = ctx.Output(framework::GradVarName("Bias")); auto* h0 = ctx.Input("H0"); - auto* ordered_proj0 = ctx.Input("OrderedP0"); auto* c0 = ctx.Input("C0"); auto* h0_g = ctx.Output(framework::GradVarName("H0")); @@ -363,6 +401,17 @@ class LSTMPGradKernel : public framework::OpKernel { Tensor cur_proj = batch_proj.Slice(bstart, bend); Tensor proj_g = batch_proj_g.Slice(bstart, bend); + + if (proj_clip && proj_clip > 0.0) { + T* dx_data = proj_g.data(); + T* x_data = cur_proj.data(); + int64_t numel = proj_g.numel(); + Transform trans; + trans(ctx.template device_context(), dx_data, + dx_data + numel, x_data, dx_data, + _ClipGradFunctor(-1.0 * proj_clip, proj_clip)); + } + if (proj_act != math::detail::ActivationType::kIdentity) { auto cur_proj_dev = EigenMatrix::From(cur_proj); auto proj_g_dev = EigenMatrix::From(proj_g); @@ -412,7 +461,7 @@ class LSTMPGradKernel : public framework::OpKernel { math::LstmUnitGradFunctor::compute( device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, - gate_act, cell_act, cand_act); + cell_clip, gate_act, cell_act, cand_act); if (n > 0) { int pre_h_start = static_cast(batch_starts[n - 1]); @@ -431,31 +480,14 @@ class LSTMPGradKernel : public framework::OpKernel { ReorderInitState(device_ctx, *h0, order, &ordered_h0, true); if (weight_g) { - blas.MatMul(*ordered_proj0, true, gate_g, false, - static_cast(1.0), weight_g, static_cast(1.0)); + blas.MatMul(ordered_h0, true, gate_g, false, static_cast(1.0), + weight_g, 
static_cast(1.0)); } } if (h0 && (h0_g || proj_weight_g)) { ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - Tensor proj0_g; - proj0_g.Resize({in_dims[0], proj_weight->dims()[1]}); - proj0_g.mutable_data(ctx.GetPlace()); blas.MatMul(gate_g, false, *weight, true, static_cast(1.0), - &proj0_g, static_cast(0.0)); - if (proj_act != math::detail::ActivationType::kIdentity) { - auto proj0_dev = EigenMatrix::From(*ordered_proj0); - auto proj0_g_dev = EigenMatrix::From(proj0_g); - ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev, - proj0_g_dev); - } - if (h0_g) { - blas.MatMul(proj0_g, false, *proj_weight, true, static_cast(1.0), - &ordered_h0_g, static_cast(0.0)); - } - if (proj_weight_g) { - blas.MatMul(ordered_h0, true, proj0_g, false, static_cast(1.0), - proj_weight_g, static_cast(1.0)); - } + &ordered_h0_g, static_cast(0.0)); } } } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 4b6eef18d8b967af5f3a5df0dee750620e7e412a..d4837696241b8c4e3cca4f2afe872c6be559853c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -39,6 +39,7 @@ math_library(cross_entropy) math_library(cos_sim_functor) math_library(depthwise_conv DEPS cub) math_library(im2col) +math_library(sample_prob) math_library(sampler) math_library(gru_compute DEPS activation_functions math_function) diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 2e3779ff0845294e71f27801049c010e0a585e6b..ad79c58063a8a12c703979fe32a8e671a5ade857 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -32,7 +32,8 @@ namespace detail { template void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, ActivationType active_node, + int frame_size, T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state) { T r_value_in; @@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, template void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - ActivationType active_node, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { T r_value_in; @@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, template void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, ActivationType active_node, + int frame_size, T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state) { #ifdef __AVX__ @@ -194,7 +196,7 @@ void 
avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, template void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - ActivationType active_node, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { #ifdef __AVX__ @@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, template void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, - ActivationType active_node, ActivationType active_gate, - ActivationType active_state) { + T cell_clip, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { - avx_lstm_forward_one_sequence(op, value, frame_size, active_node, - active_gate, active_state); + avx_lstm_forward_one_sequence(op, value, frame_size, cell_clip, + active_node, active_gate, active_state); } else { - naive_lstm_forward_one_sequence(op, value, frame_size, active_node, - active_gate, active_state); + naive_lstm_forward_one_sequence(op, value, frame_size, cell_clip, + active_node, active_gate, active_state); } } template void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, ActivationType active_node, + int frame_size, T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { - avx_lstm_backward_one_sequence(op, value, grad, frame_size, active_node, - active_gate, active_state); + avx_lstm_backward_one_sequence(op, value, grad, frame_size, cell_clip, + active_node, active_gate, active_state); } else { - naive_lstm_backward_one_sequence(op, value, grad, frame_size, + naive_lstm_backward_one_sequence(op, value, grad, frame_size, cell_clip, active_node, active_gate, active_state); } } diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index 2aecb69237fdf344ebc0bfe72d9c7c147f06358d..e0ca9e7f5b2f4a8bb837768d645b5103aa3e6760 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -31,7 +31,8 @@ namespace detail { */ template __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, - int batch_size, ActivationType active_node, + int batch_size, T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, &r_state, 
&r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - active_node, active_gate, active_state); + &cell_clip, active_node, active_gate, active_state); value.gate_value[frame_idx] = r_value_in; value.gate_value[frame_idx + frame_size] = r_value_ig; @@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, template __global__ void KeLstmBackward(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - int batch_size, ActivationType active_node, + int batch_size, T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, - &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node, - active_gate, active_state); + &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip, + active_node, active_gate, active_state); grad.gate_grad[frame_idx] = r_grad_in; grad.gate_grad[frame_idx + frame_size] = r_grad_ig; @@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, template void gpu_lstm_forward(const platform::DeviceContext& context, Op op, LstmMetaValue value, int frame_size, int batch_size, - ActivationType active_node, ActivationType active_gate, - ActivationType active_state) { + T cell_clip, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { dim3 threads; dim3 grid; if (batch_size == 1) { @@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, if (batch_size == 1) { KeLstmForward<<>>( - op, value, frame_size, batch_size, active_node, active_gate, + op, value, frame_size, batch_size, cell_clip, active_node, active_gate, active_state); } else { KeLstmForward<<>>( - op, value, frame_size, batch_size, active_node, active_gate, + op, value, frame_size, batch_size, cell_clip, active_node, active_gate, active_state); } } @@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, template void gpu_lstm_backward(const platform::DeviceContext& context, Op op, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + int frame_size, int batch_size, T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { dim3 threads; @@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op, if (batch_size == 1) { KeLstmBackward<<>>( - op, value, grad, frame_size, batch_size, active_node, active_gate, - active_state); + op, value, grad, frame_size, batch_size, cell_clip, active_node, + active_gate, active_state); } else { KeLstmBackward<<>>( - op, value, grad, frame_size, batch_size, active_node, active_gate, - active_state); + op, value, grad, frame_size, batch_size, cell_clip, active_node, + active_gate, active_state); } } diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h index cbe73d62938d7c4c03a2c8731665260624417fd7..8149686c97a030b91e0c4de708b9abf07f83203d 100644 --- a/paddle/fluid/operators/math/detail/lstm_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_kernel.h @@ -29,7 +29,7 @@ class lstm { public: HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og, 
T *prev_state, T *state, T *state_atv, T *output, - T *checkI, T *checkF, T *checkO, + T *checkI, T *checkF, T *checkO, T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -37,6 +37,15 @@ class lstm { *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate); *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate); *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg); + + if (*cell_clip > 0.0) { + if (*state < -1.0 * (*cell_clip)) { + *state = -1.0 * (*cell_clip); + } + if (*state > *cell_clip) { + *state = *cell_clip; + } + } *value_og = activation(*value_og + (*state) * (*checkO), active_gate); *state_atv = activation(*state, active_state); *output = (*value_og) * (*state_atv); @@ -52,7 +61,7 @@ class lstm { __m256 *value_fg, __m256 *value_og, __m256 *prev_state, __m256 *state, __m256 *state_atv, __m256 *output, __m256 *checkI, - __m256 *checkF, __m256 *checkO, + __m256 *checkF, __m256 *checkO, T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -65,6 +74,13 @@ class lstm { active_gate); *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig), _mm256_mul_ps(*prev_state, *value_fg)); + + if (*cell_clip > 0.0f) { + __m256 min = _mm256_set1_ps(0.0f - *cell_clip); + __m256 max = _mm256_set1_ps(*cell_clip); + *state = _mm256_min_ps(max, *state); + *state = _mm256_max_ps(min, *state); + } *value_og = activation( _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate); *state_atv = activation(*state, active_state); @@ -86,15 +102,26 @@ class lstm { T *prev_state, T *prev_state_grad, T *state, T *state_grad, T *state_atv, T *output_grad, T *checkI, T *checkF, T *checkO, T *checkIGrad, - T *checkFGrad, T *checkOGrad, + T *checkFGrad, T *checkOGrad, T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { *grad_og = activation((*output_grad) * (*state_atv), *value_og, active_gate); - *state_grad += - activation((*output_grad) * (*value_og), *state_atv, active_state) + - (*grad_og) * (*checkO); + if (*cell_clip > 0.0f) { + if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) { + *state_grad = 0.0f; + } else { + *state_grad += + activation((*output_grad) * (*value_og), *state_atv, active_state) + + (*grad_og) * (*checkO); + } + } else { + *state_grad += + activation((*output_grad) * (*value_og), *state_atv, active_state) + + (*grad_og) * (*checkO); + } + *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node); *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate); *grad_fg = @@ -117,15 +144,24 @@ class lstm { __m256 *prev_state, __m256 *prev_state_grad, __m256 *state, __m256 *state_grad, __m256 *state_atv, __m256 *output_grad, __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad, - __m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node, - ActivationType active_gate, ActivationType active_state) { + __m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og, active_gate); - *state_grad = - _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og), - *state_atv, active_state), - *state_grad); - *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad); + if (*cell_clip > 0.0f) { + T *state_ = reinterpret_cast(state); + if (*state_ >= 
(*cell_clip) || *state_ <= (0.0f - (*cell_clip))) { + *state_grad = _mm256_set1_ps(0.0f); + } else { + *state_grad = + _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og), + *state_atv, active_state), + *state_grad); + *state_grad = + _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad); + } + } *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in, active_node); *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig, diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc index b6882b4fd8e6db8592a282410888d5625bae742a..94bbcbb50670d9f0b11b77cf6a54a99c227521bf 100644 --- a/paddle/fluid/operators/math/lstm_compute.cc +++ b/paddle/fluid/operators/math/lstm_compute.cc @@ -24,12 +24,12 @@ template struct LstmUnitFunctor { static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, - const detail::ActivationType& gate_act, + T cell_clip, const detail::ActivationType& gate_act, const detail::ActivationType& cell_act, const detail::ActivationType& cand_act) { for (int b = 0; b < batch_size; b++) { detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, - cand_act, gate_act, cell_act); + cell_clip, cand_act, gate_act, cell_act); value.gate_value += frame_size * 4; value.state_value += frame_size; value.state_active_value += frame_size; @@ -45,13 +45,14 @@ template struct LstmUnitGradFunctor { static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + int frame_size, int batch_size, T cell_clip, const detail::ActivationType& gate_act, const detail::ActivationType& cell_act, const detail::ActivationType& cand_act) { for (int b = 0; b < batch_size; b++) { detail::cpu_lstm_backward(detail::backward::lstm(), value, grad, - frame_size, cand_act, gate_act, cell_act); + frame_size, cell_clip, cand_act, gate_act, + cell_act); value.gate_value += frame_size * 4; value.state_value += frame_size; diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu index 1233000083d6efc31fcbc527e8e9efb83224b4e3..e7445d3d40ae92ff66e7d33a38bfdebfc8455f0a 100644 --- a/paddle/fluid/operators/math/lstm_compute.cu +++ b/paddle/fluid/operators/math/lstm_compute.cu @@ -24,12 +24,12 @@ template struct LstmUnitFunctor { static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, - const detail::ActivationType& gate_act, + T cell_clip, const detail::ActivationType& gate_act, const detail::ActivationType& cell_act, const detail::ActivationType& cand_act) { detail::gpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, batch_size, cand_act, gate_act, - cell_act); + frame_size, batch_size, cell_clip, cand_act, + gate_act, cell_act); } }; @@ -37,13 +37,13 @@ template struct LstmUnitGradFunctor { static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + int frame_size, int batch_size, T cell_clip, const detail::ActivationType& gate_act, const detail::ActivationType& cell_act, const detail::ActivationType& cand_act) { detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, - frame_size, batch_size, cand_act, gate_act, - cell_act); + frame_size, batch_size, cell_clip, cand_act, + gate_act, cell_act); } }; diff --git a/paddle/fluid/operators/math/lstm_compute.h 
b/paddle/fluid/operators/math/lstm_compute.h index ca2f78e6f318ce39bd2272bbce20f6a6f98fe430..80af5639387aaf6a983365e13c3478353c27a617 100644 --- a/paddle/fluid/operators/math/lstm_compute.h +++ b/paddle/fluid/operators/math/lstm_compute.h @@ -50,7 +50,7 @@ template class LstmUnitFunctor { public: static void compute(const DeviceContext &context, LstmMetaValue value, - int frame_size, int batch_size, + int frame_size, int batch_size, T cell_clip, const detail::ActivationType &gate_act, const detail::ActivationType &cell_act, const detail::ActivationType &cand_act); @@ -61,7 +61,7 @@ class LstmUnitGradFunctor { public: static void compute(const DeviceContext &context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType &gate_act, + T cell_clip, const detail::ActivationType &gate_act, const detail::ActivationType &cell_act, const detail::ActivationType &cand_act); }; diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc new file mode 100644 index 0000000000000000000000000000000000000000..99aa318453eae161807353198a78e11085cd6237 --- /dev/null +++ b/paddle/fluid/operators/math/sample_prob.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sample_prob.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SampleWithProb; +template class SampleWithProb; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu new file mode 100644 index 0000000000000000000000000000000000000000..8f9391591560cc3f76ac67f43121c4b1cff90e12 --- /dev/null +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -0,0 +1,161 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sample_prob.h" +#include "paddle/fluid/operators/math/sampler.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; + +template +__device__ T gpu_adjust_prob(const T prob, const int num_samples, + const int num_tries) { + if (num_samples == num_tries) { + return prob * num_samples; + } else { + return -expm1(num_tries * log1p(-prob)); + } +} + +class GPULogUniformSampler { + public: + __device__ int64_t Sample(float random, const int range, + const float log_range) const; + __device__ float Probability(int64_t value, const float log_range) const; +}; + +__device__ int64_t GPULogUniformSampler::Sample(float random, const int range, + const float log_range) const { + // Got Log Uniform distribution from uniform distribution by + // inverse_transform_sampling method + const int64_t value = static_cast(exp(random * log_range)) - 1; + // Mathematically, value should be <= range_, but might not be due to some + // floating point roundoff, so we mod by range_. + return value % range; +} + +__device__ float GPULogUniformSampler::Probability( + int64_t value, const float log_range) const { + // Given f(x) = 1/[(x+1) * log_range_] + // The value's probability is integral of f(x) from value to (value + 1) + return (log((value + 2.0) / (value + 1.0))) / log_range; +} + +template +__global__ void SamplingCondidate( + const size_t n, const int num_tries, const int range, const float log_range, + const int num_true, const std::size_t num_samples, + const int64_t* label_data, int64_t* samples_data, T* probabilities_data) { + const int num_sampled_classes = num_true + num_samples; + + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = 0; + GPULogUniformSampler sampler; + + for (; idx < n; idx += blockDim.x * gridDim.x) { + int col_idx = idx % num_sampled_classes; + int row_idx = idx / num_sampled_classes; + if (col_idx < num_true) { + samples_data[idx] = label_data[row_idx * num_true + col_idx]; + } else { + samples_data[idx] = samples_data[col_idx]; + } + probabilities_data[idx] = sampler.Probability(samples_data[idx], log_range); + probabilities_data[idx] = + gpu_adjust_prob(probabilities_data[idx], num_samples, num_tries); + } +} + +template +int UniqSampler(const Sampler& sampler, const std::size_t num_samples, + int64_t* samples_data) { + // sample num_samles unique samples for an example, note that they are not + // all negative samples + std::unordered_set tmp_samples; + tmp_samples.clear(); + int num_tries = 0; + int j = 0; + while (j < num_samples) { + ++num_tries; + auto v = sampler.Sample(); + auto insert_ok = tmp_samples.insert(v).second; + if (!insert_ok) { + continue; + } + samples_data[j] = v; + ++j; + } + return num_tries; +} + +template +void GPUSampleWithProb::operator()( + const platform::CUDADeviceContext& context, const int seed, + const int dict_size, const bool uniq, const std::size_t num_samples, + const Tensor* L, Tensor* S, Tensor* P) { + // UNDERSTAND: dimension issues + const auto lbl_dim = L->dims(); + const int batch_size = lbl_dim[0]; + const int num_true = lbl_dim[1]; + const int num_sampled_classes = num_true + num_samples; + framework::DDim ret_dim{batch_size, num_sampled_classes}; + 
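The sampling math used in this file is compact enough to restate on its own: a log-uniform sample is drawn by inverse-transform sampling, its probability follows from integrating 1/((x+1) * log_range) over one unit, and under unique sampling the probability is adjusted to the chance of being drawn at least once in num_tries attempts. A host-side sketch of those three formulas, assuming only the standard library:

#include <cmath>
#include <cstdint>

// Inverse-transform sampling from a log-uniform distribution over [0, range).
int64_t LogUniformSample(float random01, int range, float log_range) {
  int64_t value = static_cast<int64_t>(std::exp(random01 * log_range)) - 1;
  return value % range;  // guard against floating-point round-off
}

// P(value) = integral of 1/((x+1) * log_range) from value to value + 1.
float LogUniformProbability(int64_t value, float log_range) {
  return std::log((value + 2.0f) / (value + 1.0f)) / log_range;
}

// With unique sampling, the adjusted probability is the chance the class is
// drawn at least once in num_tries attempts: 1 - (1 - p)^num_tries,
// computed stably as -expm1(num_tries * log1p(-p)).
float AdjustProb(float prob, int num_samples, int num_tries) {
  if (num_samples == num_tries) return prob * num_samples;
  return -std::expm1(num_tries * std::log1p(-prob));
}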
+ // UNDERSTAND: raw data view + const int64_t* label_data = L->data(); + int64_t* samples_data = S->data(); + T* probabilities_data = P->data(); + + int s_size = num_samples; + framework::DDim s_dim{s_size}; + Tensor s; + int64_t* s_data = s.mutable_data(s_dim, platform::CPUPlace()); + + math::LogUniformSampler sampler(dict_size, seed); + + int range = dict_size; + float log_range = log(range + 1); + + int num_tries = UniqSampler(sampler, num_samples, s_data); + VLOG(1) << "num_tries: " << num_tries; + PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + cudaMemcpyHostToDevice)); + + int threads = 512; + const size_t size = batch_size * num_sampled_classes; + int grid = (batch_size * num_sampled_classes + threads - 1) / threads; + SamplingCondidate<<>>( + size, num_tries, range, log_range, num_true, num_samples, label_data, + samples_data, probabilities_data); +} + +template class GPUSampleWithProb; +template class GPUSampleWithProb; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h new file mode 100644 index 0000000000000000000000000000000000000000..e5a6d84cb2b0527c606e62a19ef02d669945ecb1 --- /dev/null +++ b/paddle/fluid/operators/math/sample_prob.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/sampler.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; + +/* UNDERSTAND: utility function to adjust probability for unique sampling, +return whatever as it is if not using unique samping */ +template +static T adjust_prob(const T prob, const int num_samples, const int num_tries) { + if (num_samples == num_tries) { + return prob * num_samples; + } else { + return -expm1(num_tries * log1p(-prob)); + } +} + +template +class SampleWithProb { + public: + void operator()(const DeviceContext& context, const Sampler& sampler, + const std::size_t num_samples, const Tensor* L, Tensor* S, + Tensor* P) { + // UNDERSTAND: dimension issues + const auto lbl_dim = L->dims(); + const int batch_size = lbl_dim[0]; + const int num_true = lbl_dim[1]; + const int num_sampled_classes = num_true + num_samples; + framework::DDim ret_dim{batch_size, num_sampled_classes}; + + // UNDERSTAND: raw data view + const int64_t* label_data = L->data(); + int64_t* samples_data = + S->mutable_data(ret_dim, context.GetPlace()); + T* probabilities_data = P->mutable_data(ret_dim, context.GetPlace()); + + // temp sets for unique sampling + std::unordered_set tmp_samples; + int j = 0; // column index + // add true labels, not that efficient + while (j < num_true) { + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + j; + auto v = label_data[i * num_true + j]; + samples_data[samples_index] = v; + probabilities_data[samples_index] = sampler.Probability(v); + } + ++j; + } + + // sample num_samles unique samples for an example, note that they are not + // all negative samples + tmp_samples.clear(); + int num_tries = 0; + while (j < num_sampled_classes) { + ++num_tries; + auto v = sampler.Sample(); + auto insert_ok = tmp_samples.insert(v).second; + if (!insert_ok) { + continue; + } + auto p = sampler.Probability(v); + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + j; + samples_data[samples_index] = v; + probabilities_data[samples_index] = p; + } + ++j; + } + + // compute Q(y|x), because of unique sampling, probabilities need to be + // adjusted + for (int k = 0; k < num_sampled_classes; ++k) { + for (int i = 0; i < batch_size; ++i) { + auto samples_index = i * num_sampled_classes + k; + probabilities_data[samples_index] = adjust_prob( + probabilities_data[samples_index], num_samples, num_tries); + } + } + } +}; + +#ifdef PADDLE_WITH_CUDA +template +class GPUSampleWithProb { + public: + void operator()(const platform::CUDADeviceContext& context, const int seed, + const int dict_size, const bool uniq, + const std::size_t num_samples, const Tensor* L, Tensor* S, + Tensor* P); +}; +#endif +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index e16b6f78d16ce29cc493c4c795c7fe97a4bf2550..5b7505f3c4acdef94fead04efd00b47825274117 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -52,11 +52,6 @@ class MKLDNNActivationKernel "Wrong layout/format set for Input x tensor"); Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto &attr : attrs) { - *attr.second = 
ctx.Attr(attr.first); - } functor(ctx); } }; @@ -76,11 +71,6 @@ class MKLDNNActivationGradKernel "is_test attribute should be set to False in training phase."); Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto &attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } functor(ctx); } }; @@ -235,7 +225,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx, std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); PADDLE_ENFORCE(src_memory != nullptr, "Fail to find src_memory in device context"); - src_memory->set_data_handle(*p_src_data.get()); + src_memory->set_data_handle(*p_src_data); std::shared_ptr diff_src_memory; diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 0ce174654e85175f0b949f860a00afafc548ed3e..7ac64e6ba134c034acc58c7310cd51da0f03d16d 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && - input->format() != memory::format::format_undef, - "Wrong layout/format set for Input tensor"); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && - filter->format() != memory::format::format_undef, - "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN); PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, @@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; - auto src_format = input->format(); - mkldnn::memory::format weights_format = - GetWeightsFormat(filter->format(), g, is_conv3d); - - auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), src_format); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), weights_format); + // For convolution with groups we need to recreate primitive descriptor + // as Paddle tensor is not having group dims while mkldnn treats + // group as another dimensions + mkldnn::memory::primitive_desc user_weights_mpd = + filter->get_mkldnn_prim_desc(); + if (g > 1) { + mkldnn::memory::format weights_format = + GetWeightsFormat(filter->format(), g, is_conv3d); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); + user_weights_mpd = + mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine); + } /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - weights_format = mkldnn::memory::format::any; + mkldnn::memory::format weights_format = mkldnn::memory::format::any; // Check the format for user's special output if (chosen_memory_format != mkldnn::memory::format::any) { if (is_conv3d) { @@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from 
input tensors (data/weights) - auto user_src_memory_p = - handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_src_memory_p = handler.AcquireSrcMemory( + input->get_mkldnn_prim_desc(), to_void_cast(input_data)); auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); + user_weights_mpd, to_void_cast(filter_data)); // create reorder primitive if the input format is not the preferred one auto src_memory_p = @@ -281,8 +282,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_p); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + auto dst_mpd = dst_memory_p->get_primitive_desc(); + output->set_mkldnn_prim_desc(dst_mpd); } void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); @@ -947,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // push primitive to stream and wait until it's executed pipeline.push_back(*conv_bwd_weights_p); - filter_grad->set_layout(DataLayout::kMKLDNN); - filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); + auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc(); + filter_grad->set_mkldnn_prim_desc(filter_grad_mpd); } if (input_grad) { diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 76b00b396c1349eff5db1059268e7cf280a8fc64..d01e8dbf4ce0c92bb81fc76df68d5424f9da0717 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { // The format of output is set as the mkldnn's format // TODO(@mozga-intel) The format of matrix sets inside the another layers. 
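For reference, the change above replaces the old `set_layout(kMKLDNN)` + `set_format(...)` pair on a Tensor with a single memory primitive descriptor (`set_mkldnn_prim_desc`), built either from the input tensor or, as in the gaussian_random kernel, from the tensor's dims via `create_prim_desc_from_dims`. Below is a minimal sketch of what such a primitive descriptor bundles, written against the MKL-DNN 0.x C++ API that these kernels use; the engine, dims and format are illustrative only and the sketch does not show the internals of Paddle's `create_prim_desc_from_dims` helper.

```cpp
#include <mkldnn.hpp>

int main() {
  // A CPU engine; MKL-DNN 0.x identifies it by kind + index.
  mkldnn::engine cpu_engine(mkldnn::engine::cpu, 0);

  // Logical dims of a 4-D tensor (oihw-style ordering, as in the diff).
  mkldnn::memory::dims dims = {8, 3, 32, 32};

  // The memory descriptor carries dims + data type + format, i.e. the same
  // information the old (layout, format) pair on the Tensor carried.
  auto md = mkldnn::memory::desc(dims, mkldnn::memory::data_type::f32,
                                 mkldnn::memory::format::oihw);

  // Binding the descriptor to an engine yields a memory primitive descriptor,
  // which is the object set_mkldnn_prim_desc() stores on the Tensor.
  auto mpd = mkldnn::memory::primitive_desc(md, cpu_engine);

  return mpd.get_size() > 0 ? 0 : 1;
}
```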
- tensor->set_layout(DataLayout::kMKLDNN); - tensor->set_format(mkldnn::memory::format::oihw); + // TODO(jczaja): Remove this hack after checking performance on block layout + + auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(tensor->dims()), + mkldnn::memory::format::oihw); + tensor->set_mkldnn_prim_desc(tensor_mem_pd); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index f4bad7b712b2b078ed68f0a3d0e751d9ae2d6191..38a65b50bd22354bea54819e8e71015202e96e9f 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -198,7 +198,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { } // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_p.get())}; + std::vector pipeline{*pool_p}; stream(stream::kind::eager).submit(pipeline).wait(); output->set_layout(DataLayout::kMKLDNN); @@ -367,8 +367,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory); pool_bwd_p = std::make_shared( - pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory, - *(diff_src_memory)); + pool_bwd_pd, *diff_dst_memory, *workspace_memory, *diff_src_memory); dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); } else { @@ -404,7 +403,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { if (is_diff_dst_reordered) { pipeline.push_back(reorder_diff_dst); } - pipeline.push_back(*(pool_bwd_p.get())); + pipeline.push_back(*pool_bwd_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); in_x_grad->set_layout(DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index d2b149535426d097fea4b8fffa9efe82bd6edc64..dc1176f0848b93dd6872f676c3a71dab4f3455fd 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -66,8 +66,7 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { "Fail to find softmax primitive in device context"); if (softmax_p == nullptr) { softmax_p = std::make_shared( - *(softmax_pd_.get()), - *(static_cast(src_memory_p.get())), + *softmax_pd_, *(static_cast(src_memory_p.get())), *(static_cast(dst_memory_p.get()))); dev_ctx_.SetBlob(prim_key, softmax_p); } else { @@ -88,8 +87,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { "Fail to find softmax backward primitive in device context"); if (softmax_bwd_p == nullptr) { softmax_bwd_p = std::make_shared( - *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()), - *(diff_src_memory_p.get())); + *softmax_bwd_pd_, *dst_memory_p, *diff_dst_memory_p, + *diff_src_memory_p); dev_ctx_.SetBlob(prim_key, softmax_bwd_p); } else { is_reusing_ = true; diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index c39f94637a1abb5bfce9a5428419282f2b870c91..fe4131df2c77ed28cd36f23002d000dac3e8a129 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -160,7 +160,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto get_selected_row = [&](size_t i) -> const SelectedRows& { if (i == 0 && in0) { - return *in0.get(); + return *in0; } else { return in_vars[i]->Get(); } diff --git 
a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index e6df7028f540d0928e2bb0763bd4cfef12059665..e41bfb80dfc0452955f7978f74ccfea184886b69 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn_engine, key); auto transpose_src_memory_p = handler.AcquireSrcMemory( - input->format(), platform::to_void_cast(input_data)); + input->get_mkldnn_prim_desc(), platform::to_void_cast(input_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(output, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + // Transpose did change logical dimensions of Tensor, but reorder does not. + // Reorder does change only physical layout eg. format , strides + // so we need to create new primitive descriptor with changed logical layout + // so it match output shape + auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(output->dims()), + mkldnn::memory::format::blocked); + output->set_mkldnn_prim_desc(output_mem_pd); } }; @@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, mkldnn_engine, key); - auto transpose_src_memory_p = handler.AcquireSrcMemory( - out_grad->format(), platform::to_void_cast(out_grad_data)); + auto transpose_src_memory_p = + handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(), + platform::to_void_cast(out_grad_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(x_grad, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + // Transpose did change logical dimensions of Tensor, but reorder does not. + // Reorder does change only physical layout eg. 
format , strides + // so we need to create new primitive descriptor with changed logical layout + // so it match output shape + auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(x_grad->dims()), + mkldnn::memory::format::blocked); + x_grad->set_mkldnn_prim_desc(x_grad_mem_pd); } }; diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt index 6b256ef02666c21ec1db3f6922b56bb23363b4a0..7559d29ce233dfcebf8b3118b4c700c35fe15d32 100644 --- a/paddle/fluid/operators/ngraph/CMakeLists.txt +++ b/paddle/fluid/operators/ngraph/CMakeLists.txt @@ -2,4 +2,5 @@ if(WITH_NGRAPH) cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) + add_subdirectory(ops) endif() diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc index 4bfcba6c3ce312e21e281e32fe1cb92ef45fda6f..996376c53f07b5c26eccad382e734f187f75f5a1 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -19,50 +19,21 @@ limitations under the License. */ #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { namespace operators { -namespace NG_OPS = paddle::operators::ngraphs; -std::map&, - std::shared_ptr>>)>> - NgraphBridge::NG_NODE_MAP = { - {"accuracy", NG_OPS::BuildAccuracyNode}, - {"conv2d", NG_OPS::BuildConv2dNode}, - {"conv2d_grad", NG_OPS::BuildConv2dGradNode}, - {"batch_norm", NG_OPS::BuildBatchNormNode}, - {"batch_norm_grad", NG_OPS::BuildBatchNormGradNode}, - {"cross_entropy", NG_OPS::BuildCrossEntropyNode}, - {"cross_entropy_grad", NG_OPS::BuildCrossEntropyGradNode}, - {"elementwise_add", NG_OPS::BuildElementwiseAddNode}, - {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode}, - {"fill_constant", NG_OPS::BuildFillConstantNode}, - {"mean", NG_OPS::BuildMeanNode}, - {"mean_grad", NG_OPS::BuildMeanGradNode}, - {"momentum", NG_OPS::BuildMomentumNode}, - {"mul", NG_OPS::BuildMulNode}, - {"mul_grad", NG_OPS::BuildMulGradNode}, - {"pool2d", NG_OPS::BuildPool2dNode}, - {"pool2d_grad", NG_OPS::BuildPool2dGradNode}, - {"softmax", NG_OPS::BuildSoftmaxNode}, - {"softmax_grad", NG_OPS::BuildSoftmaxGradNode}, - {"scale", NG_OPS::BuildScaleNode}, - {"sigmoid", NG_OPS::BuildUnaryNode}, - {"sum", NG_OPS::BuildSumNode}, - {"relu", NG_OPS::BuildUnaryNode}, - {"relu_grad", NG_OPS::BuildReluGradNode}, - {"tanh", NG_OPS::BuildUnaryNode}, - {"tanh_grad", NG_OPS::BuildTanhGradNode}, - {"top_k", NG_OPS::BuildTopKNode}}; +bool NgraphBridge::isRegister(const std::string& str) { + return ops::NgraphSingleton::Lookup(str); +} void NgraphBridge::BuildNgNode( const std::shared_ptr& op) { auto& op_type = op->Type(); - NG_NODE_MAP[op_type](op, ngb_node_map_); + ops::NgraphSingleton::BuildNode(ngb_node_map_, op, op_type); } } // namespace operators diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h index c57988f8f6322e76678c572aa21ff5b17b9e3c22..952d5b0b4362aa1c1112782885ab5d30698f5cff 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.h +++ 
b/paddle/fluid/operators/ngraph/ngraph_bridge.h @@ -28,13 +28,6 @@ namespace operators { class NgraphBridge { public: - static std::map< - std::string, - std::function&, - std::shared_ptr>>)>> - NG_NODE_MAP; - explicit NgraphBridge( std::shared_ptr< std::unordered_map>> @@ -43,6 +36,8 @@ class NgraphBridge { void BuildNgNode(const std::shared_ptr& op); + static bool isRegister(const std::string& str); + private: std::shared_ptr< std::unordered_map>> diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc index bec4b514a218715134d2366dd7efd7cf5b377b68..660a3298cbe4bf5d83851a916bb3ea8d260214a3 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -88,14 +88,12 @@ static std::vector> NgraphOpIntervals( int pivot = left; while (pivot < right) { auto op_type = ops.at(pivot)->Type(); - if (NgraphBridge::NG_NODE_MAP.find(op_type) == - NgraphBridge::NG_NODE_MAP.end()) { + if (NgraphBridge::isRegister(op_type)) { ++pivot; } else { int start = pivot, end = start; while (pivot < right && - (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) != - NgraphBridge::NG_NODE_MAP.end())) { + (!NgraphBridge::isRegister(ops.at(pivot)->Type()))) { ++pivot; ++end; } diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h deleted file mode 100644 index 8edb4dd2a10787c334e0c630a54fc9b833ac0e60..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file contains the list of the ngraph operators for Paddle. - * - * ATTENTION: It requires some C++11 features, for lower version C++ or C, we - * might release another API. - */ - -#pragma once - -#include "ops/accuracy_op.h" -#include "ops/activation_op.h" -#include "ops/batch_norm_op.h" -#include "ops/binary_unary_op.h" -#include "ops/conv2d_op.h" -#include "ops/cross_entropy_op.h" -#include "ops/elementwise_add_op.h" -#include "ops/fill_constant_op.h" -#include "ops/mean_op.h" -#include "ops/momentum_op.h" -#include "ops/mul_op.h" -#include "ops/pool2d_op.h" -#include "ops/scale_op.h" -#include "ops/softmax_op.h" -#include "ops/sum_op.h" -#include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/CMakeLists.txt b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dee3308b74a70a2daf35055d3ac80a14de99ac1 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/CMakeLists.txt @@ -0,0 +1,8 @@ +file(GLOB LIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h") +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ngraph/ngraph_ops.h) +file(APPEND ${pass_file} "\#pragma once\n") +file(WRITE ${pass_file} "// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt. 
DO NOT EDIT!\n\n") + +foreach(OPS_NAME ${LIST_OPS}) + file(APPEND ${pass_file} "\#include \"paddle/fluid/operators/ngraph/ops/${OPS_NAME}\"\n") +endforeach(OPS_NAME) diff --git a/paddle/fluid/operators/ngraph/ops/accuracy_op.h b/paddle/fluid/operators/ngraph/ops/accuracy_op.h index bf37ce48d8c2ce3b97fac154be9d1dfb08421f97..d90ec97298b0f6fb8480e97ca57cb427784261e4 100644 --- a/paddle/fluid/operators/ngraph/ops/accuracy_op.h +++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -63,3 +64,5 @@ void BuildAccuracyNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(accuracy, BuildAccuracyNode); diff --git a/paddle/fluid/operators/ngraph/ops/activation_op.h b/paddle/fluid/operators/ngraph/ops/activation_op.h index f66080e3aabc05d3ce5ecaa3791de4410e34fa37..d1b0b80d227a5042219a17e35255617726aa8042 100644 --- a/paddle/fluid/operators/ngraph/ops/activation_op.h +++ b/paddle/fluid/operators/ngraph/ops/activation_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -50,3 +51,6 @@ void BuildTanhGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(relu_grad, BuildReluGradNode); +REGISTER_NG_OP(than_grad, BuildTanhGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h index f0d2d5f27f81c1cc5b0774478ea00e5350049ac2..2d638bb53f084ee75014d64302ec3d86b3bcf26f 100644 --- a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h +++ b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -155,3 +156,6 @@ void BuildBatchNormGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(batch_norm, BuildBatchNormNode); +REGISTER_NG_OP(batch_norm_grad, BuildBatchNormGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h index 0c0d25d0cd1ae536618057ce80388b8eeb81c68a..375f188286c123b1d652f8780989404760c8e1a4 100644 --- a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h +++ b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -47,3 +48,7 @@ static void BuildUnaryNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(relu, BuildUnaryNode); +REGISTER_NG_OP(tanh, BuildUnaryNode); +REGISTER_NG_OP(sigmoid, BuildUnaryNode); diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h index 46fb2703f51482afa0546f08b8fc7b2c98e281bc..d664825c53ebf17435a0ec532969978abe6d30ca 100644 --- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h +++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -233,3 +234,6 @@ void BuildConv2dGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(conv2d, BuildConv2dNode); +REGISTER_NG_OP(conv2d_grad, BuildConv2dGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h index f88a2cb94103b2305875655cefaec16263b4bf4f..3ab158f3e13a33bdb7e423919c7592831fa9831a 100644 --- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h +++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -143,3 +144,6 @@ void BuildCrossEntropyGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(cross_entropy, BuildCrossEntropyNode); +REGISTER_NG_OP(cross_entropy_grad, BuildCrossEntropyGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h index 868df51e16a9714a750bac64dadc3441de79165e..fb796c336a9b45966a0ff703286faa8b61752483 100644 --- a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h +++ b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -85,3 +86,6 @@ void BuildElementwiseAddGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(elementwise_add, BuildElementwiseAddNode); +REGISTER_NG_OP(elementwise_add_grad, BuildElementwiseAddGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h index 58783bc220fa02879aaebacc5ac55a07b13184f1..bc958f2ba27cf929408d56d41bf22976caf7d6ae 100644 --- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h +++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -55,3 +56,5 @@ void BuildFillConstantNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(fill_constant, BuildFillConstantNode); diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h index 4c44bc4c112f401c2707f7babd49a33f238a768f..f839d9978d71c2967a7f2c2f22622dc615907831 100644 --- a/paddle/fluid/operators/ngraph/ops/mean_op.h +++ b/paddle/fluid/operators/ngraph/ops/mean_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -64,3 +65,6 @@ void BuildMeanGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(mean, BuildMeanNode); +REGISTER_NG_OP(mean_grad, BuildMeanGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/momentum_op.h b/paddle/fluid/operators/ngraph/ops/momentum_op.h index f1b365c488d31c437ec17f5872c2c739e00a86a7..b8291a08a28b585a7ceb67642ba28c3314195790 100644 --- a/paddle/fluid/operators/ngraph/ops/momentum_op.h +++ b/paddle/fluid/operators/ngraph/ops/momentum_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -99,3 +100,5 @@ void BuildMomentumNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(momentum, BuildMomentumNode); diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h index 4a6cbebe245f891c6c33b2116330a41d89d50e25..98c70a1a99aa899ed8fdd3c4674668cefd14c4ae 100644 --- a/paddle/fluid/operators/ngraph/ops/mul_op.h +++ b/paddle/fluid/operators/ngraph/ops/mul_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -130,3 +131,6 @@ static void BuildMulGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(mul, BuildMulNode); +REGISTER_NG_OP(mul_grad, BuildMulGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/op_bridge.h b/paddle/fluid/operators/ngraph/ops/op_bridge.h new file mode 100644 index 0000000000000000000000000000000000000000..93df0ad8062745380d9cd4ca5027bef1425083bf --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/op_bridge.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include + +#include "ngraph/node.hpp" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace ops { + +class NgraphSingleton { + NgraphSingleton() = default; + NgraphSingleton(NgraphSingleton const&) = delete; + void operator=(NgraphSingleton const) = delete; + + ~NgraphSingleton() = default; + + static std::map< + std::string, + std::function&, + std::shared_ptr>>)>> + ng_node_maps_; + + public: + template + static void Register(TF&& tf, const std::string& name) { + ng_node_maps_[name] = tf; + } + + static bool Lookup(const std::string& name) { + auto it = ng_node_maps_.find(name); + if (it == ng_node_maps_.end()) { + return true; + } + return false; + } + + static void BuildNode( + const std::shared_ptr>>& ng_maps, + const std::shared_ptr& op, + const std::string& name) { + ng_node_maps_[name](op, ng_maps); + } +}; + +std::map&, + std::shared_ptr>>)>> + NgraphSingleton::ng_node_maps_; + +} // namespace ops +} // namespace operators +} // namespace paddle + +#define REGISTER_NG_OP(op_type__, Converter__) \ + struct ng_##op_type__##_converter { \ + ng_##op_type__##_converter() { \ + paddle::operators::ops::NgraphSingleton::Register( \ + paddle::operators::ngraphs::Converter__, #op_type__); \ + } \ + }; \ + ng_##op_type__##_converter ng_##op_type__##_converter__; diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h index 836c9d6c185b305d3dd4c9e9d30e23abb0c1431c..a6371372ef10c093c41153cb0dc73f4f9e95687f 100644 --- a/paddle/fluid/operators/ngraph/ops/pool2d_op.h +++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -172,3 +173,6 @@ void BuildPool2dGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(pool2d, BuildPool2dNode); +REGISTER_NG_OP(pool2d_grad, BuildPool2dGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h index 91a57d0be606373e985a30b7ac9c73648062d8e4..a334192419f572c429f5842cd9e418d8945eb0ef 100644 --- a/paddle/fluid/operators/ngraph/ops/scale_op.h +++ b/paddle/fluid/operators/ngraph/ops/scale_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -37,3 +38,5 @@ void BuildScaleNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(scale, BuildScaleNode); diff --git a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h index fc6395c08bc6b00990679c5327c3152a980be821..1df6418de06d000892d2802596df61320fcdc759 100644 --- a/paddle/fluid/operators/ngraph/ops/softmax_op.h +++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h @@ -18,6 +18,7 @@ limitations under the License. 
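The `REGISTER_NG_OP` macro defined in op_bridge.h follows the standard C++ static-registration idiom: the macro expands to a small struct whose constructor inserts the builder into a singleton map, plus a global instance of that struct, so every translation unit that is linked in registers its ops before `main()` runs. A minimal, self-contained sketch of the idiom is shown below; the names (`REGISTER_DEMO_OP`, `BuildReluDemo`, `Registry`) are illustrative and not part of Paddle, and the sketch uses a function-local static map rather than the header-defined class static used by `NgraphSingleton`.

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <string>

// A registry keyed by op name; in Paddle the mapped value is the node builder.
using Builder = std::function<void()>;

static std::map<std::string, Builder>& Registry() {
  static std::map<std::string, Builder> registry;
  return registry;
}

// The macro defines a struct whose constructor performs the registration and
// a global instance of it, so linking the file is enough to register the op.
#define REGISTER_DEMO_OP(op_type__, builder__)                    \
  struct demo_##op_type__##_registrar {                           \
    demo_##op_type__##_registrar() {                              \
      Registry()[#op_type__] = builder__;                         \
    }                                                             \
  };                                                              \
  static demo_##op_type__##_registrar demo_##op_type__##_registrar__;

void BuildReluDemo() { std::cout << "building relu node\n"; }

REGISTER_DEMO_OP(relu, BuildReluDemo);

int main() {
  // Lookup + dispatch mirrors NgraphSingleton::Lookup / BuildNode.
  auto it = Registry().find("relu");
  if (it != Registry().end()) it->second();
  return 0;
}
```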
*/ #include #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -72,3 +73,6 @@ void BuildSoftmaxGradNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(softmax, BuildSoftmaxNode); +REGISTER_NG_OP(softmax_grad, BuildSoftmaxGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/top_k_op.h b/paddle/fluid/operators/ngraph/ops/top_k_op.h index 852ecd7139a3c7046e78265ca021b2ce286c63c0..6d10faa7c2efb9cbd87fa8ef1c6ecb4fa350d8f6 100644 --- a/paddle/fluid/operators/ngraph/ops/top_k_op.h +++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -42,3 +43,5 @@ void BuildTopKNode( } // namespace ngraphs } // namespace operators } // namespace paddle + +REGISTER_NG_OP(top_k, BuildTopKNode); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index fc3636e0b24765f681d3260b07fe854309774a40..0a0ece162cc63696974383d8ed49fdd10204c331 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() { "be ignored."); // TODO(Chengduo): Add checker. // (Currently, // TypedAttrChecker don't support vector type.) - AddAttr("global_pooling", - "(bool, default false) Whether to use the global pooling. " - "If global_pooling = true, ksize and paddings will be ignored.") + AddAttr( + "global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, kernel size and paddings will be ignored.") .SetDefault(false); AddAttr>("strides", "(vector, default {1, 1}), strides(height, " @@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() { "paddings", "(vector, default {0,0}), paddings(height, width) of pooling " "operator." - "If global_pooling = true, paddings and ksize will be ignored.") + "If global_pooling = true, paddings and kernel size will be ignored.") .SetDefault({0, 0}); AddAttr( "exclusive", @@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() { .SetDefault(false); AddAttr( "ceil_mode", - "(bool, default false) Wether to use the ceil function to calculate " + "(bool, default false) Whether to use the ceil function to calculate " "output height and width. False is the default. 
If it is set to False, " "the floor function will be used.") .SetDefault(false); @@ -262,28 +263,37 @@ Example: For exclusive = false: $$ hstart = i * strides[0] - paddings[0] + $$ + $$ hend = hstart + ksize[0] + $$ + $$ wstart = j * strides[1] - paddings[1] + $$ + $$ wend = wstart + ksize[1] + $$ + $$ Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} $$ + For exclusive = true: $$ hstart = max(0, i * strides[0] - paddings[0]) + $$ + $$ hend = min(H, hstart + ksize[0]) + $$ + $$ wstart = max(0, j * strides[1] - paddings[1]) + $$ + $$ wend = min(W, wstart + ksize[1]) + $$ + $$ Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} $$ - For adaptive = true: - $$ - hstart = floor(i * H_{in} / H_{out}) - hend = ceil((i + 1) * H_{in} / H_{out}) - wstart = floor(j * W_{in} / W_{out}) - wend = ceil((j + 1) * W_{in} / W_{out}) - Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} - $$ )DOC"); } @@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() { AddAttr( "global_pooling", "(bool, default false) Whether to use the global pooling. " - "If global_pooling = true, ksize and paddings wille be ignored.") + "If global_pooling = true, kernel size and paddings will be ignored.") .SetDefault(false); AddAttr>( "strides", @@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() { .SetDefault(false); AddAttr( "ceil_mode", - "(bool, default false) Wether to use the ceil function to calculate " + "(bool, default false) Whether to use the ceil function to calculate " "output height and width. False is the default. If it is set to False, " "the floor function will be used.") .SetDefault(false); @@ -392,48 +402,68 @@ Example: Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ For ceil_mode = false: - $$ - D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ - H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 - $$ + $$ + D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 + $$ + $$ + H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1 + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ For ceil_mode = true: - $$ - D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\ - H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 - $$ + $$ + D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 + $$ + $$ + H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 + $$ + For exclusive = false: - $$ - dstart = i * strides[0] - paddings[0] - dend = dstart + ksize[0] - hstart = j * strides[1] - paddings[1] - hend = hstart + ksize[1] - wstart = k * strides[2] - paddings[2] - wend = wstart + ksize[2] - Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} - $$ + $$ + dstart = i * strides[0] - paddings[0] + $$ + $$ + dend = dstart + ksize[0] + $$ + $$ + hstart = j * strides[1] - paddings[1] + $$ + $$ + hend = hstart + ksize[1] + $$ + $$ + wstart = k * strides[2] - paddings[2] + $$ + $$ + wend = wstart + ksize[2] + $$ + $$ + Output(i ,j, k) = 
\\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} + $$ + For exclusive = true: - $$ - dstart = max(0, i * strides[0] - paddings[0]) - dend = min(D, dstart + ksize[0]) - hstart = max(0, j * strides[1] - paddings[1]) - hend = min(H, hstart + ksize[1]) - wstart = max(0, k * strides[2] - paddings[2]) - wend = min(W, wstart + ksize[2]) - Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} - $$ - - For adaptive = true: - $$ - dstart = floor(i * D_{in} / D_{out}) - dend = ceil((i + 1) * D_{in} / D_{out}) - hstart = floor(j * H_{in} / H_{out}) - hend = ceil((j + 1) * H_{in} / H_{out}) - wstart = floor(k * W_{in} / W_{out}) - wend = ceil((k + 1) * W_{in} / W_{out}) - Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} - $$ + $$ + dstart = max(0, i * strides[0] - paddings[0]) + $$ + $$ + dend = min(D, dstart + ksize[0]) + $$ + $$ + hend = min(H, hstart + ksize[1]) + $$ + $$ + wstart = max(0, k * strides[2] - paddings[2]) + $$ + $$ + wend = min(W, wstart + ksize[2]) + $$ + $$ + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ )DOC"); } diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7f7fb26b17c77e6fe87646d3cac20c02c49b52c --- /dev/null +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -0,0 +1,225 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sample_logits_op.h" +#include "paddle/fluid/operators/math/sample_prob.h" + +namespace paddle { +namespace operators { + +class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Logits", + "(Tensor, default: Tensor), The unscaled log probabilities " + "which is a 2-D tensor with shape [N x K]. N is the batch_size, " + "and K is the class number."); + AddInput("Labels", + "(Tensor) The ground truth which is a 2-D tensor. Labels is a " + "Tensor with shape [N x NT], where NT is the number of" + "true labels for each example."); + AddInput("CustomizedSamples", + "(Tensor, default: Tensor), A 2-D tensor with shape [N, " + "NT + S]," + " where N is the batch size, NT is the number of true labels " + "and S is the number of negtive sample for each example." + "The first NT elements of each row should be the same with true " + "labels, " + "followed by S custom negtive samples. This tensor" + "is only used when use_customized_samples is true.") + .AsDispensable(); + AddInput( + "CustomizedProbabilities", + "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." + "The tensor has the same shape with CustomSamples," + "and each element represents probability of element in CustomSamples. 
" + "This " + "tensor is only used when use_customized_samples is true.") + .AsDispensable(); + AddOutput("Samples", + "(Tensor, default: Tensor), A 2-D tensor with shape [N, " + "NT + S]." + "The outputs value of sampler, including NT true lables and S " + "negetive samples " + "for each example. This will be used in" + "backward calculation.") + .AsIntermediate(); + AddOutput( + "Probabilities", + "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." + "The probabilites of sampled positive and negtive labels.") + .AsIntermediate(); + AddOutput("SampledLogits", + "(Tensor, default: Tensor), A 2-D tensor with shape" + "[N, NT + S]. The outputs value of sampled logits, which will be" + "used in backward propagation.") + .AsIntermediate(); + AddOutput( + "SampledLabels", + "(Tensor, default: Tensor), A 2-D tensor. The sampled labels" + "with shape [N, NT]. The tonsor contains hard labels as input to " + " softmax op, that is 0, 1, ..., NT-1 because of the first NT elements" + " of Sampels are positive lables."); + AddAttr( + "use_customized_samples", + "An indicator whether to use customized samples with probabilities, if " + "True" + "the operator will use customized samples and customized probabilities" + "otherwise, the operator will generate them by itself.") + .SetDefault(false); + AddAttr( + "uniq", + "An indicator whether to sample non-repetitive negtive labels, if True" + "the operator will sample negtive labels without replacement." + "Otherwise, the operator will sample negtive labels with replacement.") + .SetDefault(true); + AddAttr( + "remove_accidental_hits", + "An indicator whether to remove accidental hits when samples hits true" + "labels, the removal is implemented by subtracting the corresponding" + "logits by float_max to subpress their softmax to be zero.") + .SetDefault(true); + AddAttr("num_samples", "The number of negative samples."); + AddAttr("seed", "Random seed for generating samples").SetDefault(0); + + AddComment(R"DOC( + """ + Computes sampled output training logits and labels suitable for implementing + sampled softmax. 
+ """ + +)DOC"); + } +}; + +class SampleLogitsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Samples"), + "Output(Samples) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Probabilities"), + "Output(Probabilities) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("SampledLogits"), + "Output(SampledLogits) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"), + "Output(SampledLabels) should be not null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ( + logits_dims.size(), 2UL, + "The logits of softmax_with_cross_entropy should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, + "The labels should be a 2-D tensor."); + + const int num_samples = ctx->Attrs().Get("num_samples"); + const int num_sampled_classes = labels_dims[1] + num_samples; + ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes}); + ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes}); + ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes}); + ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Logits")); + framework::OpKernelType kt = + framework::OpKernelType(data_type, ctx.device_context()); + return kt; + } +}; + +// UNDERSTAND: InferShape for Grad +class SampleLogitsOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Samples"), + "Input(Samples) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("SampledLogits"), + "Input(SampledLogits) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")), + "Input(SampledLogits@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@Grad) should be not null."); + + auto logit_dims = ctx->GetInputDim("Logits"); + auto label_dims = ctx->GetInputDim("Labels"); + PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, + "The label should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL, + "The logits should be a 2-D tensor."); + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Logits")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("SampledLogits"))); + framework::OpKernelType kt = + framework::OpKernelType(data_type, ctx.device_context()); + return kt; + } +}; + +// UNDERSTAND: what's the rule for making a GradMaker TODO +class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; 
+ + protected: + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("sample_logits_grad"); + grad_op->SetInput("Logits", Input("Logits")); + grad_op->SetInput("Labels", Input("Labels")); + grad_op->SetInput("Samples", Output("Samples")); + grad_op->SetInput("SampledLogits", Output("SampledLogits")); + grad_op->SetInput(framework::GradVarName("SampledLogits"), + OutputGrad("SampledLogits")); + grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sample_logits, ops::SampleLogitsOp, ops::SampleLogitsOpMaker, + ops::SampleLogitsGradMaker); +REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad); +REGISTER_OP_CPU_KERNEL(sample_logits, ops::SampleLogitsKernel, + ops::SampleLogitsKernel); +REGISTER_OP_CPU_KERNEL(sample_logits_grad, ops::SampleLogitsGradKernel, + ops::SampleLogitsGradKernel); diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb49793b730f72d66dc846f233bd95ebdab37c52 --- /dev/null +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -0,0 +1,257 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sample_prob.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/sample_logits_op.h" + +namespace paddle { +namespace operators { + +// UNDERSTAND: something like take_along_axis in numpy. +template +__global__ void GPUTakeAlongD1(size_t size, const int batch_size, + const int array_slice_size, + const int idx_slice_size, const T* p_array, + const int64_t* p_index, T* p_value) { + const auto value_slice_size = idx_slice_size; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = blockDim.x * gridDim.x; + + for (; idx < size; idx += step_size) { + int i = idx / idx_slice_size; + auto array_index = p_index[idx]; + p_value[idx] = p_array[i * array_slice_size + array_index]; + } +} + +// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate +// indices, scatter is done in += way. 
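The GPUTakeAlongD1 / GPUPutAlongD1 kernels in sample_logits_op.cu gather and scatter-add along dimension 1 of a [batch, classes] matrix using a per-row index matrix, the same semantics as numpy's take_along_axis / put_along_axis. A plain-CPU sketch of that indexing follows; the sizes and values are hypothetical and it only demonstrates the index arithmetic the kernels above parallelize.

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int batch_size = 2, num_classes = 5, num_take = 3;

  // logits: [batch_size, num_classes], row-major.
  std::vector<float> logits = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f,
                               1.1f, 1.2f, 1.3f, 1.4f, 1.5f};
  // Per-row column indices to gather: [batch_size, num_take].
  std::vector<long long> index = {4, 0, 2,
                                  1, 1, 3};

  // "take_along_axis": value[i][j] = logits[i][index[i][j]]
  std::vector<float> value(batch_size * num_take);
  for (int i = 0; i < batch_size; ++i)
    for (int j = 0; j < num_take; ++j)
      value[i * num_take + j] =
          logits[i * num_classes + index[i * num_take + j]];

  // "put_along_axis" with accumulation: grad[i][index[i][j]] += value[i][j].
  // Duplicate indices (row 1 uses column 1 twice) are summed, not overwritten.
  std::vector<float> grad(batch_size * num_classes, 0.0f);
  for (int i = 0; i < batch_size; ++i)
    for (int j = 0; j < num_take; ++j)
      grad[i * num_classes + index[i * num_take + j]] +=
          value[i * num_take + j];

  std::printf("value[0][0]=%.1f  grad[1][1]=%.1f\n", value[0],
              grad[1 * num_classes + 1]);  // 0.5 and 2.4
  return 0;
}
```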
+template +__global__ void GPUPutAlongD1(size_t size, const int batch_size, + const int array_slice_size, + const int idx_slice_size, T* p_array, + const int64_t* p_index, const T* p_value) { + const auto value_slice_size = idx_slice_size; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = blockDim.x * gridDim.x; + + // size == batch_size + for (; idx < size; idx += step_size) { + int i = idx; + for (int j = 0; j < idx_slice_size; ++j) { + auto array_index = p_index[i * idx_slice_size + j]; + p_array[i * array_slice_size + array_index] += + p_value[i * idx_slice_size + j]; + } + } +} + +// UNDERSTAND: set label as 0,1,...,num_true-1 +template +__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = blockDim.x * gridDim.x; + + for (; idx < size; idx += step_size) { + p_array[idx] = idx % num_true; + } +} + +// UNDERSTAND: compute accidentdal hits from samples and minus corresponding +// logits by a float max, here 1e20 +template +__global__ void gpu_compute_remove_accidental_hits(const int size, + const int num_true, + const int idx_slice_size, + const int64_t* p_index, + T* p_value) { + const auto value_slice_size = idx_slice_size; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = blockDim.x * gridDim.x; + + for (; idx < size; idx += step_size) { + int i = idx / idx_slice_size; + if (idx % idx_slice_size < num_true) continue; + for (int j = 0; j < num_true; ++j) { + const auto true_idx = i * idx_slice_size + j; + if (p_index[true_idx] == p_index[idx]) { + p_value[idx] -= 1e20; + break; + } + } + } +} + +template +class SampleLogitsCUDAKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& context) const override { + // get necessary inputs + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Labels"); + VLOG(3) << "Enter SampleLogitsCUDAKernel"; + + // get necessary outputs + Tensor* samples = context.Output("Samples"); + Tensor* probabilities = context.Output("Probabilities"); + Tensor* sampled_logits = context.Output("SampledLogits"); + Tensor* sampled_labels = context.Output("SampledLabels"); + + // shapes + const auto batch_size = logits->dims()[0]; + const auto num_classes = logits->dims()[1]; + const auto labels_dim = labels->dims(); + const auto num_true = labels_dim[1]; + const auto samples_dim = samples->dims(); + + // attrs + const auto num_samples = context.Attr("num_samples"); + const bool use_customized_samples = + context.Attr("use_customized_samples"); + const bool uniq = context.Attr("uniq"); + const bool remove_accidental_hits = + context.Attr("remove_accidental_hits"); + + // device contexts + auto& dev_ctx = context.cuda_device_context(); + + // UNDERSTAND: allocate memories for temporaries + sampled_logits->mutable_data(samples_dim, context.GetPlace()); + math::SetConstant set_zero; + set_zero(dev_ctx, sampled_logits, static_cast(0)); + + auto sampled_labels_data = + sampled_labels->mutable_data(labels_dim, context.GetPlace()); + int threads = 512; + size_t size = batch_size * num_true; + int grid = (size + threads - 1) / threads; + GPUSetLabel< + T><<>>( + size, num_true, sampled_labels_data); + + if (use_customized_samples) { + const Tensor* customized_samples = + context.Input("CustomizedSamples"); + const Tensor* customized_probabilities = + context.Input("CustomizedProbabilities"); + 
samples->ShareDataWith(*customized_samples); + probabilities->ShareDataWith(*customized_probabilities); + } else { + samples->mutable_data(context.GetPlace()); + probabilities->mutable_data(samples_dim, context.GetPlace()); + // UNDERSTAND: sampling + const auto seed = context.Attr("seed"); + auto sampler_with_prob = math::GPUSampleWithProb(); + sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq, + num_samples, labels, samples, probabilities); + } + + // UNDERSTAND: gather sampled logits and remove accidental hits if needed + const auto num_take = samples->dims()[1]; + const auto array_dims = logits->dims(); + const auto idx_dims = samples->dims(); + + const T* p_array = logits->data(); + const int64_t* p_index = samples->data(); + T* p_value = sampled_logits->data(); + + // src slice size + const auto array_slice_size = array_dims[1]; + // index slice size + const auto idx_slice_size = idx_dims[1]; + + size = batch_size * num_take; + grid = (size + threads - 1) / threads; + GPUTakeAlongD1< + T><<>>( + size, batch_size, array_slice_size, idx_slice_size, p_array, p_index, + p_value); + + if (remove_accidental_hits) { + const size_t size = batch_size * (num_true + num_samples); + int grid = (size + threads - 1) / threads; + gpu_compute_remove_accidental_hits< + T><<>>( + size, num_true, idx_slice_size, p_index, p_value); + } + + // subtracted sampled logits with logQ(y|x) + auto probs = EigenMatrix::From(*probabilities); + auto smp_logits = EigenMatrix::From(*sampled_logits); + smp_logits.device(*dev_ctx.eigen_device()) = + (smp_logits - probs.log().unaryExpr(TolerableValue())) + .unaryExpr(TolerableValue()); + } +}; + +template +class SampleLogitsGradCUDAKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& context) const override { + auto logits_grad = context.Output(framework::GradVarName("Logits")); + const Tensor* samples = context.Input("Samples"); + const Tensor* sampled_logits_grad = + context.Input(framework::GradVarName("SampledLogits")); + logits_grad->mutable_data(context.GetPlace()); + + auto& dev_ctx = context.cuda_device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, logits_grad, static_cast(0)); + + // UNDERSTAND: scatter it back to logit_grad + const auto batch_size = samples->dims()[0]; + const auto num_put = samples->dims()[1]; + const auto array_dims = logits_grad->dims(); + const auto idx_dims = samples->dims(); + + T* p_array = logits_grad->data(); + const int64_t* p_index = samples->data(); + const T* p_value = sampled_logits_grad->data(); + + // src slice size + const auto array_slice_size = array_dims[1]; + // index slice size + const auto idx_slice_size = idx_dims[1]; + + int threads = 128; + const size_t size = batch_size; + int grid = (size + threads - 1) / threads; + + GPUPutAlongD1< + T><<>>( + size, batch_size, array_slice_size, idx_slice_size, p_array, p_index, + p_value); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(sample_logits, ops::SampleLogitsCUDAKernel, + ops::SampleLogitsCUDAKernel); +REGISTER_OP_CUDA_KERNEL(sample_logits_grad, + ops::SampleLogitsGradCUDAKernel, + ops::SampleLogitsGradCUDAKernel); diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b55a24863cc09d5f80e07aedbbb5b3d9ac99e69e --- /dev/null +++ b/paddle/fluid/operators/sample_logits_op.h 
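The CUDA kernels above all use the same launch pattern: the grid size is the ceiling division `(size + threads - 1) / threads`, and each kernel iterates with a grid-stride loop so any grid size covers any problem size. A minimal standalone CUDA sketch of that pattern, with illustrative sizes and a hypothetical `FillIota` kernel, is shown below.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Grid-stride loop: each thread starts at its global id and hops by the total
// number of launched threads, so the kernel is correct for any grid size.
__global__ void FillIota(size_t size, float* out) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int step = blockDim.x * gridDim.x;
  for (; idx < size; idx += step) {
    out[idx] = static_cast<float>(idx);
  }
}

int main() {
  const size_t size = 1000;
  float* d_out = nullptr;
  cudaMalloc(&d_out, size * sizeof(float));

  // Ceiling division: enough blocks so that threads * grid >= size.
  const int threads = 512;
  const int grid = static_cast<int>((size + threads - 1) / threads);
  FillIota<<<grid, threads>>>(size, d_out);

  float last = 0.0f;
  cudaMemcpy(&last, d_out + size - 1, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("grid=%d last=%.0f\n", grid, last);  // grid=2 last=999

  cudaFree(d_out);
  return 0;
}
```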
@@ -0,0 +1,245 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sample_prob.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +// UNDERSTAND: something like take_along_axis in numpy. +template +static void CPUTakeAlongD1(const platform::DeviceContext& ctx, + const framework::Tensor& array, + const framework::Tensor& index, + framework::Tensor* value) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) + PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 && + index.dims()[0] == array.dims()[0] && + index.dims() == value->dims()); + + const auto batch_size = index.dims()[0]; + const auto num_take = index.dims()[1]; + const auto array_dims = array.dims(); + const auto idx_dims = index.dims(); + + // UNDERSTAND: no allocations here + const T* p_array = array.data(); + const int64_t* p_index = index.data(); + T* p_value = value->data(); + + // src slice size + const auto array_slice_size = array_dims[1]; + + // index slice size + const auto idx_slice_size = idx_dims[1]; + const auto value_slice_size = idx_slice_size; + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < num_take; ++j) { + auto array_index = p_index[i * idx_slice_size + j]; + p_value[i * value_slice_size + j] = + p_array[i * array_slice_size + array_index]; + } + } +} + +// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate +// indices, scatter is done in += way. 
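The TolerableValue functor defined above caps ±INFINITY at ±1e20 so that log Q(y|x) for a probability that underflows to zero (or a customized probability of exactly 0) cannot poison the `sampled_logits - log(probs)` subtraction with infinities or NaNs. A tiny sketch of the effect, assuming double precision and illustrative values, follows; it is not the Eigen expression used by the kernels, only the scalar behaviour.

```cpp
#include <cmath>
#include <cstdio>

// Mirror of the TolerableValue idea: replace infinities with a large finite
// cap so later arithmetic stays finite.
double Tolerable(double x) {
  const double kApproInf = 1e20;
  if (x == INFINITY) return kApproInf;
  if (x == -INFINITY) return -kApproInf;
  return x;
}

int main() {
  const double logit = 3.0;
  const double prob = 0.0;  // a degenerate Q(y|x)

  const double raw = logit - std::log(prob);  // 3 - (-inf) = +inf
  const double capped =
      Tolerable(logit - Tolerable(std::log(prob)));  // 3 + 1e20, finite

  std::printf("raw=%g  capped=%g\n", raw, capped);
  return 0;
}
```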
+template +static void CPUPutAlongD1(const platform::DeviceContext& ctx, + framework::Tensor* array, + const framework::Tensor& index, + const framework::Tensor& value) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // UNDERSTAND: check shape src(B, C), index(B, K), value should also be (B, K) + PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 && + index.dims()[0] == array->dims()[0] && + index.dims() == value.dims()); + const auto batch_size = index.dims()[0]; + const auto num_put = index.dims()[1]; + auto array_dims = array->dims(); + auto idx_dims = index.dims(); + + // UNDERSTAND: no allocations here + T* p_array = array->data(); + const int64_t* p_index = index.data(); + const T* p_value = value.data(); + + // slice sizes + const auto array_slice_size = array_dims[1]; + const auto idx_slice_size = idx_dims[1]; + const auto value_slice_size = idx_slice_size; + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < num_put; ++j) { + auto array_index = p_index[i * idx_slice_size + j]; + p_array[i * array_slice_size + array_index] += + p_value[i * value_slice_size + j]; + } + } +} + +// UNDERSTAND: compute accidental hits from samples and subtract a large +// constant (here 1e20) from the corresponding logits +template +static void compute_remove_accidental_hits(const platform::DeviceContext& ctx, + framework::Tensor* sampled_logits, + const framework::Tensor& samples, + const int num_true) { + const auto batch_size = sampled_logits->dims()[0]; + const auto num_sampled_classes = sampled_logits->dims()[1]; + T* sampled_logits_data = sampled_logits->data(); + const auto samples_data = samples.data(); + + std::unordered_set tmp_true_labels; + for (int i = 0; i < batch_size; ++i) { + tmp_true_labels.clear(); + tmp_true_labels.insert(samples_data + i * num_sampled_classes, + samples_data + i * num_sampled_classes + num_true); + for (int j = num_true; j < num_sampled_classes; ++j) { + const auto idx = i * num_sampled_classes + j; + if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end()) + sampled_logits_data[idx] -= 1e20; + } + } +} + +template +class SampleLogitsKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), + "This kernel only runs on CPU."); + VLOG(3) << "Enter SampleLogitsKernel"; + // get necessary inputs + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Labels"); + + // get necessary outputs + Tensor* samples = context.Output("Samples"); + Tensor* probabilities = context.Output("Probabilities"); + Tensor* sampled_logits = context.Output("SampledLogits"); + Tensor* sampled_labels = context.Output("SampledLabels"); + + // shapes + const auto batch_size = logits->dims()[0]; + const auto num_classes = logits->dims()[1]; + const auto labels_dim = labels->dims(); + const auto num_true = labels_dim[1]; + const auto samples_dim = samples->dims(); + + // attrs + const auto num_samples = context.Attr("num_samples"); + const bool use_customized_samples = + context.Attr("use_customized_samples"); + const bool remove_accidental_hits = + context.Attr("remove_accidental_hits"); + + // device contexts + auto& dev_ctx = + context.template device_context(); + + // UNDERSTAND: allocate memories for temporaries + sampled_logits->mutable_data(samples_dim, context.GetPlace()); + auto sampled_labels_data = + sampled_labels->mutable_data(labels_dim,
context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < num_true; ++j) { + sampled_labels_data[i * num_true + j] = j; + } + } + + if (use_customized_samples) { + const Tensor* customized_samples = + context.Input("CustomizedSamples"); + const Tensor* customized_probabilities = + context.Input("CustomizedProbabilities"); + samples->ShareDataWith(*customized_samples); + probabilities->ShareDataWith(*customized_probabilities); + } else { + samples->mutable_data(context.GetPlace()); + probabilities->mutable_data(samples_dim, context.GetPlace()); + // UNDERSTAND: sampling + const auto seed = context.Attr("seed"); + auto sampler_with_prob = + math::SampleWithProb(); + sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed), + num_samples, labels, samples, probabilities); + } + + // UNDERSTAND: gather sampled logits and remove accidental hits if needed + CPUTakeAlongD1(dev_ctx, *logits, *samples, sampled_logits); + if (remove_accidental_hits) { + compute_remove_accidental_hits(dev_ctx, sampled_logits, *samples, + num_true); + } + + // subtracted sampled logits with logQ(y|x) + auto probs = EigenMatrix::From(*probabilities); + auto smp_logits = EigenMatrix::From(*sampled_logits); + smp_logits.device(*dev_ctx.eigen_device()) = + (smp_logits - probs.log().unaryExpr(TolerableValue())) + .unaryExpr(TolerableValue()); + } +}; + +template +class SampleLogitsGradKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& context) const override { + auto logits_grad = context.Output(framework::GradVarName("Logits")); + const Tensor* samples = context.Input("Samples"); + const Tensor* sampled_logits_grad = + context.Input(framework::GradVarName("SampledLogits")); + logits_grad->mutable_data(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, logits_grad, static_cast(0)); + + // UNDERSTAND: scatter it back to logit_grad + CPUPutAlongD1(dev_ctx, logits_grad, *samples, *sampled_logits_grad); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 5833fee35b14d67727c7d155b60223fd32de439b..1838506c8931b2e1ff82adf6f277925dc9d53374 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -89,9 +89,9 @@ cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_context device_tracer) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) else() - cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) + cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce) endif() cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index ed0dbdeb13ce93926c023f9f435776f1a1839933..920b43b2b1990af58b73888bf7a652d57c20563c 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -394,7 +394,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, int tid = platform::get_cur_thread_id(); - std::lock_guard lock(*p_mutex_.get()); + std::lock_guard lock(*p_mutex_); // Find KeyBlob for current thread auto map_it = pMap->find(tid); @@ 
-427,7 +427,7 @@ std::shared_ptr MKLDNNDeviceContext::GetBlob( int tid = platform::get_cur_thread_id(); - std::lock_guard lock(*p_mutex_.get()); + std::lock_guard lock(*p_mutex_); // Find KeyBlob for current thread firstly auto map_it = pMap->find(tid); diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index f42212d09508ce4304b58e8b991230e2a45369dd..0179daa55715be9787bc7cc8a693319024d404b7 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -136,7 +136,7 @@ void EnableActivity() { CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); // We don't track these activities for now. - // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); @@ -155,7 +155,7 @@ void DisableActivity() { // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); - // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); @@ -212,6 +212,14 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, memcpy->correlationId, memcpy->bytes); break; } + case CUPTI_ACTIVITY_KIND_MEMSET: { + auto *memset = + reinterpret_cast(record); + tracer->AddKernelRecords("MEMSET", memset->start, memset->end, + memset->deviceId, memset->streamId, + memset->correlationId); + break; + } case CUPTI_ACTIVITY_KIND_DRIVER: { auto *api = reinterpret_cast(record); if (api->start != 0 && api->end != 0) @@ -348,6 +356,8 @@ class DeviceTracerImpl : public DeviceTracer { const std::vector cbids { CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, + CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 #if CUDA_VERSION >= 9000 @@ -601,6 +611,8 @@ void initCuptiCbidStr() { REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020); REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); + REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); #if CUDA_VERSION >= 9000 REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 6ee2c36146215e278c87d22dd7b8fd5cb7e33865..d4418d836d66e329af8ed3f5ec05f49d47146b3e 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.pb.h" @@ -32,8 +33,6 @@ inline uint64_t PosixInNsec() { return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); } -class Event; - // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 2. Collect cuda statistics: start/end ts, memory, etc. diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 54ad18a8e4abbddc0414ed32a719d46002e99188..bdb1d1bd3bf47ea89984587ae84d2aa84be232a4 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -34,6 +34,7 @@ limitations under the License. */ #include #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h new file mode 100644 index 0000000000000000000000000000000000000000..a4db23758b1c477114cd03dcd0e9f51296c575c6 --- /dev/null +++ b/paddle/fluid/platform/event.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +namespace paddle { +namespace platform { + +enum EventType { kMark, kPushRange, kPopRange }; + +class Event { + public: + // The DeviceContext is used to get the cuda stream. + // If CPU profiling mode, can pass nullptr. 
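+  // A second Event taken later can be compared against this one via the
+  // CpuElapsedMs()/CudaElapsedMs() helpers declared below.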
+ Event(EventType type, std::string name, uint32_t thread_id); + + const EventType& type() const; + std::string name() const { return name_; } + uint32_t thread_id() const { return thread_id_; } + +#ifdef PADDLE_WITH_CUDA +#ifndef PADDLE_WITH_CUPTI + cudaEvent_t event() const { return event_; } + int device() const { return device_; } +#endif +#endif + + double CpuElapsedMs(const Event& e) const; + double CudaElapsedMs(const Event& e) const; + + private: + EventType type_; + std::string name_; + uint32_t thread_id_; + int64_t cpu_ns_; +#ifdef PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_CUPTI + int64_t gpu_ns_ = 0; + + public: + void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) { + gpu_ns_ += end_ns - start_ns; + } + + private: +#else + cudaEvent_t event_ = nullptr; + int device_ = -1; +#endif +#endif +}; +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 269280d604a13a62046fb7811d34b7c69b61b50f..4a674ca526f455314613d43847faa7e01f4d7802 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -39,6 +39,45 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_src_mem_p"); } + // TODO(jczaja): extract common part and make AcquireMemory + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::primitive_desc& mpd, void* ptr) { + auto local_key = key_ + "@user_src_mem_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mpd, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::primitive_desc& mpd, void* ptr) { + auto local_key = key_ + "@user_weights_mem_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mpd, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + std::shared_ptr AcquireWeightsMemory( const mkldnn::memory::desc& md, void* ptr, user_function custom_func = {}) { @@ -273,37 +312,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), - axis_(axis), - logical_axis_(dims.size(), 0) {} - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::format& fmt, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - // Make memory descriptor using input format, unless it - // cannot be trusted (nchw) then make up memory fmt manually - for (size_t i = 0; i < logical_axis_.size(); ++i) { - logical_axis_[i] = i; - } - auto src_md = fmt != mkldnn::memory::format::nchw - ? 
platform::MKLDNNMemDesc( - dims_, platform::MKLDNNGetDataType(), fmt) - : Axis2MemoryDesc(dims_, logical_axis_); - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{src_md, engine_}, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } + axis_(axis) {} std::shared_ptr AcquireDstMemory(framework::Tensor* output, platform::Place place) { @@ -388,7 +397,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; std::vector axis_; - std::vector logical_axis_; }; template @@ -548,9 +556,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), "Fail to find convolution primitive in device context"); if (conv_p == nullptr) { - conv_p = std::make_shared(*conv_pd_, *(src_memory_p), - *(weights_memory_p.get()), - *(dst_memory_p.get())); + conv_p = std::make_shared(*conv_pd_, *src_memory_p, + *weights_memory_p, *dst_memory_p); dev_ctx_.SetBlob(prim_key, conv_p); } else { @@ -570,9 +577,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), "Fail to find convolution primitive in device context"); if (conv_p == nullptr) { - conv_p = std::make_shared( - *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), - *(bias_memory_p.get()), *(dst_memory_p.get())); + conv_p = std::make_shared(*conv_pd_, *src_memory_p, + *weights_memory_p, *bias_memory_p, + *dst_memory_p); dev_ctx_.SetBlob(prim_key, conv_p); } else { diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..8c511f97d12cfe299ad5629eff1871e8d156c850 --- /dev/null +++ b/paddle/fluid/platform/mkldnn_utils.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include + +namespace paddle { +namespace platform { + +inline mkldnn::memory::primitive_desc create_prim_desc_from_dims( + const std::vector& ltz, mkldnn::memory::format fmt, + mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) { + mkldnn_memory_desc_t mem_fmt; + + mem_fmt.primitive_kind = mkldnn_memory; + mem_fmt.ndims = ltz.size(); + for (unsigned int i = 0; i < ltz.size(); ++i) { + mem_fmt.dims[i] = ltz[i]; // logical dimensions (nchw format, + // regardless physical layout) + } + mem_fmt.data_type = static_cast(data_type); + mem_fmt.format = static_cast(fmt); + + unsigned int total_stride = 1; + for (int i = ltz.size() - 1; i >= 0; --i) { + mem_fmt.layout_desc.blocking.padding_dims[i] = + ltz[i]; // logical dimensions (nchw format, regardless physical + // layout) + mem_fmt.layout_desc.blocking.block_dims[i] = 1; + mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset + mem_fmt.layout_desc.blocking.strides[0][i] = total_stride; + mem_fmt.layout_desc.blocking.strides[1][i] = 1; + total_stride *= ltz[i]; + } + mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset + + auto& pool = platform::DeviceContextPool::Instance(); + auto place = paddle::platform::CPUPlace(); + auto* dev_ctx = dynamic_cast(pool.Get(place)); + auto& cpu_engine = dev_ctx->GetEngine(); + return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine); +} + +inline mkldnn::memory::primitive_desc create_prim_desc_from_format( + const std::vector& ltz, const mkldnn::memory::format format, + const mkldnn::memory::data_type data_type) { + auto md = mkldnn::memory::desc({ltz}, data_type, format); + auto& pool = platform::DeviceContextPool::Instance(); + auto place = paddle::platform::CPUPlace(); + auto dev_ctx = dynamic_cast(pool.Get(place)); + PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device"); + auto& cpu_engine = dev_ctx->GetEngine(); + return mkldnn::memory::primitive_desc(md, cpu_engine); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 436654d1024a0bf72cb46d8a9d2650cf26f8f90e..9a285a6b533dcb48013e3b3e4d34dc27186173ac 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -112,12 +112,11 @@ double Event::CpuElapsedMs(const Event& e) const { } double Event::CudaElapsedMs(const Event& e) const { -#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUPTI return gpu_ns_ / 1000000.0; -#endif #else - PADDLE_THROW("CUDA is not enabled"); + LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled"; + return 0; #endif } @@ -255,9 +254,11 @@ struct EventItem { std::string name; int calls; double total_time; - double min_time; double max_time; double ave_time; + double min_time; + double cpu_time; + double gpu_time; float ratio; }; @@ -291,8 +292,12 @@ void PrintProfiler(const std::vector>& events_table, // Output events table std::cout.setf(std::ios::left); std::cout << std::setw(name_width) << "Event" << std::setw(data_width) - << "Calls" << std::setw(data_width) << "Total" - << std::setw(data_width) << "Min." << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total"; + if (g_state == ProfilerState::kAll) { + std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)" + << std::setw(data_width * 2) << "GPU Time (Ratio)"; + } + std::cout << std::setw(data_width) << "Min." << std::setw(data_width) << "Max." << std::setw(data_width) << "Ave." << std::setw(data_width) << "Ratio." 
<< std::endl; for (size_t i = 0; i < events_table.size(); ++i) { @@ -300,8 +305,18 @@ void PrintProfiler(const std::vector>& events_table, const EventItem& event_item = events_table[i][j]; std::cout << std::setw(name_width) << event_item.name << std::setw(data_width) << event_item.calls - << std::setw(data_width) << event_item.total_time - << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.total_time; + if (g_state == ProfilerState::kAll) { + std::cout << std::setw(data_width * 2) + << string::Sprintf( + "%f (%f)", event_item.cpu_time, + (event_item.cpu_time / event_item.total_time)) + << std::setw(data_width * 2) + << string::Sprintf( + "%f (%f)", event_item.gpu_time, + (event_item.gpu_time / event_item.total_time)); + } + std::cout << std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.ave_time << std::setw(data_width) << event_item.ratio << std::endl; @@ -350,6 +365,18 @@ void ParseEvents(const std::vector>& events, return a.ave_time > b.ave_time; }; break; + case EventSortingKey::kGPUTime: + sorted_domain = "average time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.gpu_time > b.gpu_time; + }; + break; + case EventSortingKey::kCPUTime: + sorted_domain = "average time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.cpu_time > b.cpu_time; + }; + break; default: sorted_domain = "event first end time"; } @@ -388,10 +415,17 @@ void ParseEvents(const std::vector>& events, } if (rit != pushed_events.rend()) { - double event_time = (g_state == ProfilerState::kCUDA || - g_state == ProfilerState::kAll) - ? rit->CudaElapsedMs((*analyze_events)[i][j]) - : rit->CpuElapsedMs((*analyze_events)[i][j]); + double event_time = 0; + double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]); + double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]); + if (g_state == ProfilerState::kCUDA) { + event_time = gpu_time; + } else if (g_state == ProfilerState::kCPU) { + event_time = cpu_time; + } else { + event_time = gpu_time + cpu_time; + } + total += event_time; std::string event_name; @@ -408,7 +442,7 @@ void ParseEvents(const std::vector>& events, event_idx[event_name] = event_items.size(); EventItem event_item = {event_name, 1, event_time, event_time, event_time, event_time, - 0.}; + gpu_time, cpu_time, 0.}; event_items.push_back(event_item); } else { int index = event_idx[event_name]; @@ -421,6 +455,8 @@ void ParseEvents(const std::vector>& events, // max time event_items[index].max_time = std::max(event_time, event_items[index].max_time); + event_items[index].gpu_time += gpu_time; + event_items[index].cpu_time += cpu_time; } // remove the push marker from the list diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index e115c554caf383bad29aa5d065ec0126427f8e78..aed276b16e95f954539d3fadac65309314ed34f1 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/platform/profiler.h" - #include +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace platform { @@ -22,26 +21,27 @@ namespace platform { __global__ void DummyKernel(int *a) { a[0] = 0; } static void ForEachDevice(std::function func) { - auto original_device = GetCurrentDeviceId(); - int count = GetCUDADeviceCount(); + auto original_device = platform::GetCurrentDeviceId(); + int count = platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - SetDeviceId(i); + platform::SetDeviceId(i); func(i); } - SetDeviceId(original_device); + platform::SetDeviceId(original_device); } void DummyKernelAndEvent() { for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { - CUDADeviceContext *dev_ctx = new CUDADeviceContext(CUDAPlace(d)); + platform::SetDeviceId(d); + cudaStream_t stream; + PADDLE_ENFORCE(cudaStreamCreate(&stream)); Mark("_cuda_startup_"); int *ptr; PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int))); - DummyKernel<<<1, 1, 0, dev_ctx->stream()>>>(ptr); - dev_ctx->Wait(); + DummyKernel<<<1, 1, 0, stream>>>(ptr); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); PADDLE_ENFORCE(cudaFree(ptr)); - delete dev_ctx; }); } } diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 55d94f0fd84a5ef9ba6a386b947f387ec6fd76d4..aec0ae34292d62905de0e1f459b2b6db4554ebb7 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -17,54 +17,13 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace platform { - -enum EventType { kMark, kPushRange, kPopRange }; - -class Event { - public: - // The DeviceContext is used to get the cuda stream. - // If CPU profiling mode, can pass nullptr. - Event(EventType type, std::string name, uint32_t thread_id); - - const EventType& type() const; - std::string name() const { return name_; } - uint32_t thread_id() const { return thread_id_; } - -#ifdef PADDLE_WITH_CUDA -#ifndef PADDLE_WITH_CUPTI - cudaEvent_t event() const { return event_; } - int device() const { return device_; } -#endif -#endif - - double CpuElapsedMs(const Event& e) const; - double CudaElapsedMs(const Event& e) const; - - private: - EventType type_; - std::string name_; - uint32_t thread_id_; - int64_t cpu_ns_; +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/event.h" #ifdef PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_CUPTI - int64_t gpu_ns_ = 0; - - public: - void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) { - gpu_ns_ += end_ns - start_ns; - } - - private: -#else - cudaEvent_t event_ = nullptr; - int device_ = -1; -#endif +#include "paddle/fluid/platform/gpu_info.h" #endif -}; +namespace paddle { +namespace platform { enum ProfilerState { kDisabled, // disabled state @@ -117,7 +76,16 @@ struct RecordBlock { std::vector> GetAllEvents(); // Candidate keys to sort the profiling report -enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; +enum EventSortingKey { + kDefault, + kCalls, + kTotal, + kMin, + kMax, + kAve, + kCPUTime, + kGPUTime +}; // Enable the profiling function. 
void EnableProfiler(ProfilerState state); diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 528fe03c67a282248c551525e899279b561ce5d2..a851488e72d27dfcbd04546d9b531d26257f611c 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -33,7 +33,6 @@ TEST(Event, CpuElapsedTime) { } TEST(RecordEvent, RecordEvent) { - using paddle::platform::DeviceContext; using paddle::platform::Event; using paddle::platform::EventType; using paddle::platform::RecordEvent; diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index 3879cd540017ea22b0cf4eee794a172e56716b74..6dae84f016e5db8007b4a4b4df2b5ed7f5cb4f19 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -141,7 +141,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.Get(cpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); int numel = memory_size / sizeof(float); framework::Tensor tensor = @@ -156,7 +156,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.Get(gpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); int numel = memory_size / sizeof(float); framework::Tensor tensor = ctx.AllocateTmpTensor( @@ -179,7 +179,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.Get(cpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; @@ -200,7 +200,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = static_cast(pool.Get(gpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); size_t memory_size = 500; int numel = memory_size / sizeof(float); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 31c3bfa43ffec22059a602e9ff09a33188d72c91..aeabed19abfda3c857f54e5ada54d52bf95e2602 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -34,8 +34,8 @@ void BindTracer(pybind11::module* m) { framework::BlockDesc* block, const platform::CPUPlace expected_place, const bool stop_gradient = false) { - self.Trace(op, inputs, outputs, block, expected_place, - stop_gradient); + return self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); }) .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, @@ -44,8 +44,8 @@ void BindTracer(pybind11::module* m) { framework::BlockDesc* block, const platform::CUDAPlace expected_place, const bool stop_gradient = false) { - self.Trace(op, inputs, outputs, block, expected_place, - stop_gradient); + return self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); }) 
.def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 1cd1be8e8d9da8c6a82ceefc3284084bfeda0252..069750e2406bcbf327591641bf624f36969acc25 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -101,7 +101,8 @@ void BindGraph(py::module *m) { [](Graph &self, Node &node) { return self.RemoveNode(&node); }) .def("retrieve_node", &Graph::RetrieveNode, return_value_policy::reference) - .def("resolve_hazard", &Graph::ResolveHazard); + .def("resolve_hazard", &Graph::ResolveHazard) + .def("origin_program_desc", &Graph::OriginProgram); } void BindNode(py::module *m) { diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index e729be4a95a58510f1e0162af4216feaa400d971..48fe445b7d01287c37bcf7d4811f687785ca78d5 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -189,6 +189,8 @@ void BindBlockDesc(pybind11::module *m) { return self.HasVar(name); }, pybind11::return_value_policy::reference) + .def("_clear_block", [](pd::BlockDesc &self) { return self.Clear(); }, + pybind11::return_value_policy::reference) .def("_rename_var", [](pd::BlockDesc &self, const pybind11::bytes &byte_name, const pybind11::bytes &byte_name_new) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 597b36c6bfc038c6e87644b674f39f95609e1d3b..f9e7366779462a3a74797a3f8e61649da42acdf1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -976,6 +976,7 @@ All parameter, weight, gradient are variables in Paddle. [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); // -- python binds for parallel executor. + py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( ExecutionStrategy allows the user to more preciously control how to run @@ -1227,9 +1228,9 @@ All parameter, weight, gradient are variables in Paddle. cannot be updated after being finalized.)DOC"); pe.def(py::init &, - const std::unordered_set &, const ProgramDesc &, - const std::string &, Scope *, std::vector &, - const ExecutionStrategy &, const BuildStrategy &>()) + const std::unordered_set &, const std::string &, + Scope *, std::vector &, const ExecutionStrategy &, + const BuildStrategy &, ir::Graph *>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. 
We can only return Scope* diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc index a0757b53f37b29de0b3802c345b1ad9db69f16e9..1087f5672459506cc7b824127cd822c0df7ba566 100644 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ b/paddle/fluid/train/demo/demo_trainer.cc @@ -73,7 +73,7 @@ int main() { PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); // init all parameters - executor.Run(*startup_program.get(), &scope, 0); + executor.Run(*startup_program, &scope, 0); // prepare data auto x_var = scope.Var("x"); @@ -101,7 +101,7 @@ int main() { clock_t t1 = clock(); for (int i = 0; i < 10; ++i) { - executor.Run(*train_program.get(), &scope, 0, false, true); + executor.Run(*train_program, &scope, 0, false, true); std::cout << "step: " << i << " loss: " << loss_var->Get().data()[0] << std::endl; diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc index e8731dd51ad698e53b7f10cc781c52134f2d17a8..a7846da8c191ac96e9ad7fb5b3184518e32120b2 100644 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -74,7 +74,7 @@ void Train() { float first_loss = 0.0; float last_loss = 0.0; for (int i = 0; i < 100; ++i) { - executor.Run(*train_program.get(), &scope, 0, false, true); + executor.Run(*train_program, &scope, 0, false, true); if (i == 0) { first_loss = loss_var->Get().data()[0]; } else if (i == 99) { diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 26b26c9b1faf7bd976b57f1320bff878f1a21770..33e0ec4ee226126374413382fe8fcbdebdf50f9e 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -444,6 +444,7 @@ function assert_api_spec_approvals() { "paddle/fluid/framework/ir/node.h" "paddle/fluid/framework/ir/graph.h" "paddle/fluid/framework/framework.proto" + "python/paddle/fluid/compiler.py" "paddle/fluid/operators/distributed/send_recv.proto.in") for API_FILE in ${API_FILES[*]}; do API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index fa79db19ee895c028f9cad0f4212aaff0e513784..ab4011383824c67a81feba7f2a20d3aff5f4fc8f 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -17,8 +17,10 @@ import os import six import sys from .. import compat as cpt +from . import framework from . import core +from . import framework __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy'] @@ -36,7 +38,7 @@ def _place_obj(place): class CompiledProgram(object): """ - Compiles a Program for execution. + Compiles to Graph for execution. 1. Users first create the program with layers. 2. Optionally, users use CompiledProgram to optimize the program before run. @@ -51,7 +53,7 @@ class CompiledProgram(object): Example: .. code-block:: python - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) compiled_prog = compiler.CompiledProgram(main).with_data_parallel( @@ -62,11 +64,25 @@ class CompiledProgram(object): fetch_list=[loss.name]) Args: - program: Program instance that contains the model logic. + program_or_graph (Graph|Program): If it's Program, it will be first + lowered to a graph for further optimizations. If it's a graph + (potentially optimized before), it will be directly used for + further optimizations. 
Note: graph is only supported when compiled + with with_data_parallel option. """ - def __init__(self, program): - self._program = program + def __init__(self, program_or_graph): + if isinstance(program_or_graph, core.Graph): + self._graph = program_or_graph + self._program = None + elif isinstance(program_or_graph, framework.Program): + self._graph = core.Graph(program_or_graph.desc) + self._program = program_or_graph + else: + raise ValueError("Wrong program_to_graph type: %s" % + type(program_or_graph)) + + self._program_desc = self._graph.origin_program_desc() self._scope = None self._place = None self._executor = None @@ -101,6 +117,7 @@ class CompiledProgram(object): self """ assert not self._is_data_parallel, "Already compiled with parallel." + assert not self._is_inference, "Cannot compile both data parallel and inference" self._is_data_parallel = True self._build_strategy = build_strategy self._exec_strategy = exec_strategy @@ -110,6 +127,8 @@ class CompiledProgram(object): self._exec_strategy = ExecutionStrategy() if self._build_strategy is None: self._build_strategy = BuildStrategy() + self._build_strategy.is_distribution = framework.is_pserver_mode( + self._program) return self def with_inference_optimize(self, config): @@ -120,11 +139,13 @@ class CompiledProgram(object): Returns: self """ + assert not self._is_data_parallel, "Cannot compile both data parallel and inference" + assert not self._is_inference, "Already compiled with inference" + assert any([ isinstance(config, InferNativeConfig), isinstance(config, InferAnalysisConfig) ]) - self._is_data_parallel = False self._is_inference = True self._infer_config = config return self @@ -173,37 +194,41 @@ class CompiledProgram(object): os.environ.get('CPU_NUM', multiprocessing.cpu_count())) self._exec_strategy.num_threads = cpu_num * 2 - trainers_endpoints = self._program._trainers_endpoints - # FIXME(dzhwinter): enable_inplace should be after memory_optimize # if turn on python memory optimize, turn off the inplace_pass. if self._build_strategy.memory_optimize is None: - self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True + self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True if self._build_strategy.enable_inplace is None: - self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True + self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True + + # TODO(wuyi): trainer endpoings should be passed in through + # build_strategy, not program.xxx. 
+ if self._program and self._build_strategy.num_trainers > 1 and \ + self._program._trainers_endpoints: + tps = self._program._trainers_endpoints - if self._build_strategy.num_trainers > 1 and trainers_endpoints: assert self._build_strategy.num_trainers == len( - trainers_endpoints), "num_trainers == len(end_points)" - self._build_strategy.trainers_endpoints = trainers_endpoints - - self._persistable_vars = set([ - cpt.to_text(v.name) - for v in [ - var for var in self._program.list_vars() - if var.persistable and var.type != core.VarDesc.VarType.RAW - ] - ]) + tps), "num_trainers == len(end_points)" + self._build_strategy.trainers_endpoints = tps + + self._persistable_vars = [] + for block_id in range(self._program_desc.num_blocks()): + bdesc = self._program_desc.block(block_id) + self._persistable_vars.extend([ + cpt.to_text(v.name()) for v in bdesc.all_vars() + if v.persistable() and v.type() != core.VarDesc.VarType.RAW + ]) places = list(map(_place_obj, self._places)) + return core.ParallelExecutor( - places, self._persistable_vars, self._program.desc, + places, + set(self._persistable_vars), cpt.to_text(self._loss_name) if self._loss_name else six.u(''), self._scope, self._local_scopes, - self._exec_strategy, self._build_strategy) + self._exec_strategy, self._build_strategy, self._graph) def _compile_inference(self): - assert self._is_data_parallel is False return core.create_paddle_predictor(self._infer_config) def _compile(self, scope, place): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 8815911eaeb36067987c0490d7a4f3e909789499..c0191a34deaa5ab3b53b5fe4f33cb5449a2db8b3 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -538,6 +538,8 @@ class Executor(object): else: # TODO(panyx0718): Can compile program to optimize executor # performance. + # TODO(panyx0718): executor should be able to run graph. + assert program._program, "CompiledProgram is compiled from graph, can only run with_data_parallel." 
return self._run( program._program, self._default_executor, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 15367c724e5304fed78ef58f8a27932e1d6de318..7fe3120acc756d9335e5adbc2e8cfb4270e41c02 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -87,6 +87,15 @@ def _current_expected_place(): return _imperative_current_expected_place_ +def is_pserver_mode(main_program): + main = main_program if main_program \ + else default_main_program() + for op in main.global_block().ops: + if op.type in ["send", "recv"]: + return True + return False + + class NameScope(object): def __init__(self, name="", parent=None): self._children = dict() @@ -378,16 +387,19 @@ class Variable(object): # get_capacity is implemented pass - self.block.vars[name] = self - self.op = None - self.stop_gradient = stop_gradient - self.is_data = is_data if _in_imperative_mode(): + # record vars in tracer rather than blocks self._ivar = kwargs.get("ivar", None) if not self._ivar: - self._ivar = core.VarBase() + self._ivar = core.VarBase(stop_gradient) self._ivar.desc = self.desc - self._ivar.stop_gradient = stop_gradient + if persistable: + self.block.vars[name] = self + else: + self.block.vars[name] = self + self.op = None + self.stop_gradient = stop_gradient + self.is_data = is_data def _numpy(self): new_ivar = self._ivar._copy_to(core.CPUPlace(), True) @@ -723,7 +735,6 @@ class Operator(object): self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() - if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) @@ -731,6 +742,7 @@ class Operator(object): if _in_imperative_mode(): self.iop = core.OpBase() self.iop.desc = self.desc + self.inputs = defaultdict(list) if inputs is not None: for k, v in six.iteritems(inputs): @@ -738,6 +750,7 @@ class Operator(object): self.inputs[k].append(v._ivar) elif isinstance(v, list) or isinstance(v, tuple): self.inputs[k].extend([var._ivar for var in v]) + self.outputs = defaultdict(list) if outputs is not None: for k, v in six.iteritems(outputs): @@ -1187,6 +1200,15 @@ class Block(object): else: raise ValueError("Var {0} is not found recursively".format(name)) + def _clear_block(self): + # TODO(minqiyang): move this to backward_hooks + self.desc._clear_block() + + for name in self.vars.keys(): + assert self.vars[name].persistable + + del self.ops[:] + def all_parameters(self): return list(self.iter_parameters()) @@ -1317,18 +1339,31 @@ class Block(object): inputs=kwargs.get("inputs", None), outputs=kwargs.get("outputs", None), attrs=kwargs.get("attrs", None)) + + if _in_imperative_mode(): + # record ops in tracer rather than blocks + # + # TODO(minqiyang): add op stop_gradient support in static mode too. + # currently, we only support stop_gradient in imperative mode. + self._trace_op(op, kwargs.get("stop_gradient", False)) self.ops.append(op) - # TODO(minqiyang): add stop_gradient support in static mode too. - # currently, we only support stop_gradient in imperative mode. 
- self._trace_op(op, kwargs.get("stop_gradient", False)) return op def _trace_op(self, op, stop_gradient=False): - if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, - _imperative_current_expected_place_, - stop_gradient) + backward_refs = _imperative_tracer().trace( + op.iop, op.inputs, op.outputs, self.desc, + _imperative_current_expected_place_, stop_gradient) + + # TODO(minqiyang): support backward_hooks to eager remove backward_refs + op.backward_refs = defaultdict(list) + for k, v in six.iteritems(op.inputs): + if k in backward_refs: + op.backward_refs[k] = op.inputs[k] + + for k, v in six.iteritems(op.outputs): + if k in backward_refs: + op.backward_refs[k] = op.outputs[k] def _insert_op(self, index, *args, **kwargs): """ @@ -1383,7 +1418,8 @@ class Block(object): outputs=kwargs.get("outputs", None), attrs=kwargs.get("attrs", None)) self.ops.insert(0, op) - self._trace_op(op, kwargs.get("stop_gradient", False)) + if _in_imperative_mode(): + self._trace_op(op, kwargs.get("stop_gradient", False)) return op def _sync_with_cpp(self): diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 59fe6bbf74b80c2260c5b4881fee8807482c9c68..46640ce37a78f7409af7f82d3302a610ccd366b2 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -17,7 +17,7 @@ import contextlib import sys import numpy as np import collections - +from .. import unique_name from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.imperative import base @@ -26,14 +26,33 @@ __all__ = ['Layer', 'PyLayer'] class Layer(core.Layer): - """Layers composed of operators.""" - - def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None): + """Layers composed of operators. + + Args: + name_scope: prefix name used by the layer to name parameters. + If prefix is "my_model/layer_1", parameter name in MyLayer + can be "my_model/layer_1/MyLayer/w_n", where w is the parameter + base name and n is an unique suffix auto-generated. + dtype: data type for the variables in the layer. + """ + + def __init__(self, name_scope, dtype=core.VarDesc.VarType.FP32): + self._full_name = unique_name.generate(name_scope + "/" + + self.__class__.__name__) self._built = False self._dtype = dtype self._parameters = collections.OrderedDict() self._sub_layers = collections.OrderedDict() + def full_name(self): + """Full name for this layers. + + Full name is composed by name_scope + "/" + MyLayer.__class__.__name__ + + Returns full name of this name. + """ + return self._full_name + def parameters(self, include_sublayers=True): """Returns a list of Parameters from current and sub-layers. diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index c86a373ae4a92053538c93386003f9014c32841f..41655c4f54eecec55bd2c7d2b74adb51efa88b61 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -27,6 +27,7 @@ __all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding'] class Conv2D(layers.Layer): def __init__(self, + name_scope, num_channels, num_filters, filter_size, @@ -38,19 +39,17 @@ class Conv2D(layers.Layer): act=None, param_attr=None, bias_attr=None, - name=None, dtype=core.VarDesc.VarType.FP32): assert param_attr is not False, "param_attr should not be False here." - super(Conv2D, self).__init__(name=name, dtype=dtype) + super(Conv2D, self).__init__(name_scope, dtype=dtype) # TODO(minqiyang): Move this to the top. 
from ..layer_helper import LayerHelper self._helper = LayerHelper( - type(self).__name__, + self.full_name(), param_attr=param_attr, bias_attr=bias_attr, dtype=dtype, - name=name, act=act) self._groups = groups @@ -143,6 +142,7 @@ class Conv2D(layers.Layer): class Pool2D(layers.Layer): def __init__(self, + name_scope, pool_size=-1, pool_type="max", pool_stride=1, @@ -151,7 +151,6 @@ class Pool2D(layers.Layer): use_cudnn=True, ceil_mode=False, exclusive=True, - name=None, dtype=core.VarDesc.VarType.FP32): if pool_type not in ["max", "avg"]: raise ValueError( @@ -166,10 +165,10 @@ class Pool2D(layers.Layer): if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") - super(Pool2D, self).__init__(name=name, dtype=dtype) + super(Pool2D, self).__init__(name_scope, dtype=dtype) from ..layer_helper import LayerHelper - self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name) + self._helper = LayerHelper(self.full_name(), dtype=dtype) self._pool_type = pool_type self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') @@ -205,25 +204,24 @@ class Pool2D(layers.Layer): class FC(layers.Layer): def __init__(self, + name_scope, size, param_attr=None, bias_attr=None, num_flatten_dims=1, dtype=core.VarDesc.VarType.FP32, - act=None, - name=None): - super(FC, self).__init__() + act=None): + super(FC, self).__init__(name_scope) self._size = size self._num_flatten_dims = num_flatten_dims self._dtype = dtype from ..layer_helper import LayerHelper self._helper = LayerHelper( - 'FC', + self.full_name(), param_attr=param_attr, bias_attr=bias_attr, - act=act, - name=name) + act=act) def _build_once(self, input): input_shape = input.shape @@ -282,6 +280,7 @@ class FC(layers.Layer): class BatchNorm(layers.Layer): def __init__(self, + name_scope, num_channels, act=None, is_test=False, @@ -292,22 +291,20 @@ class BatchNorm(layers.Layer): dtype=core.VarDesc.VarType.FP32, data_layout='NCHW', in_place=False, - name=None, moving_mean_name=None, moving_variance_name=None, do_model_average_for_mean_and_var=False, fuse_with_relu=False, use_global_stats=False): - super(BatchNorm, self).__init__() + super(BatchNorm, self).__init__(name_scope) assert bias_attr is not False, "bias_attr should not be False in batch_norm." from ..layer_helper import LayerHelper self._helper = LayerHelper( - 'batch_norm', + self.full_name(), param_attr=param_attr, bias_attr=bias_attr, - name=name, act=act) if dtype == core.VarDesc.VarType.FP16: @@ -419,6 +416,7 @@ class Embedding(layers.Layer): constructor. Args: + name_scope: See base class. size(tuple|list): The shape of the look up table parameter. It should have two elements which indicate the size of the dictionary of embeddings and the size of each embedding vector respectively. 
@@ -446,6 +444,7 @@ class Embedding(layers.Layer): """ def __init__(self, + name_scope, size, is_sparse=False, is_distributed=False, @@ -453,7 +452,7 @@ class Embedding(layers.Layer): param_attr=None, dtype='float32'): - super(Embedding, self).__init__() + super(Embedding, self).__init__(name_scope) self._size = size self._is_sparse = is_sparse self._is_distributed = is_distributed @@ -468,7 +467,7 @@ class Embedding(layers.Layer): assert self._is_sparse is True and self._is_distributed is False from ..layer_helper import LayerHelper - self._helper = LayerHelper('embedding', param_attr=param_attr) + self._helper = LayerHelper(self.full_name(), param_attr=param_attr) self._w = self._helper.create_parameter( attr=self._param_attr, shape=self._size, diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 7d1636774c6e27ec8090ac01710e23beed5fd0e8..65864ca7e09cd4f0760637198d48154eed025c65 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -34,6 +34,9 @@ class LayerHelper(object): self.kwargs = kwargs self.layer_type = layer_type name = self.kwargs.get('name', None) + # TODO(panyx0718, minqiyang): imperative mode + # can not use both `layer_type` and `name`. Deprecate LayerHelper + # and write a Helper for imperative mode. if name is None: self.kwargs['name'] = unique_name.generate(self.layer_type) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3b43ae0b9cb63a9f4708a680cb1021d74c197550..61a7d4f31d5245e635e2e1fe33e418ce20e94180 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -545,15 +545,16 @@ def yolov3_loss(x, TypeError: Attr ignore_thresh of yolov3_loss must be a float number Examples: - .. code-block:: python - - x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') - gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') - gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') - anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] - anchors = [0, 1, 2] - loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, - ignore_thresh=0.5, downsample_ratio=32) + .. 
code-block:: python + + x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') + gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') + anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] + anchor_mask = [0, 1, 2] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors, + anchor_mask=anchor_mask, class_num=80, + ignore_thresh=0.7, downsample_ratio=32) """ helper = LayerHelper('yolov3_loss', **locals()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1a7d076835841e3c1b8a90a43437eff13645fb8a..250dc24bd8f028b22d04b3f8dde082c7e236e402 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -87,6 +87,7 @@ __all__ = [ 'transpose', 'im2sequence', 'nce', + 'sampled_softmax_with_cross_entropy', 'hsigmoid', 'beam_search', 'row_conv', @@ -668,7 +669,11 @@ def dynamic_lstmp(input, candidate_activation='tanh', proj_activation='tanh', dtype='float32', - name=None): + name=None, + h_0=None, + c_0=None, + cell_clip=None, + proj_clip=None): """ **Dynamic LSTMP Layer** @@ -785,6 +790,17 @@ def dynamic_lstmp(input, dtype(str): Data type. Choices = ["float32", "float64"], default "float32". name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the projection size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + cell_clip(float): If provided the cell state is clipped + by this value prior to the cell output activation. + proj_clip(float): If `num_proj > 0` and `proj_clip` is + provided, then the projected values are clipped elementwise to within + `[-proj_clip, proj_clip]`. Returns: tuple: A tuple of two output variable: the projection of hidden state, \ @@ -831,25 +847,41 @@ def dynamic_lstmp(input, batch_hidden = helper.create_variable_for_type_inference(dtype) batch_gate = helper.create_variable_for_type_inference(dtype) batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) + inputs = { + 'Input': input, + 'Weight': weight, + 'ProjWeight': proj_weight, + 'Bias': bias + } + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, proj_size), \ + 'The shape of h0 should be (batch_size, %d)' % proj_size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 + + if cell_clip: + assert cell_clip >= 0, "cell_clip should not be negtive." + if proj_clip: + assert proj_clip >= 0, "proj_clip should not be negtive." 
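+    # Both clips are optional; when provided they must be non-negative and are
+    # forwarded to the lstmp op as the 'cell_clip' / 'proj_clip' attributes below.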
helper.append_op( type='lstmp', - inputs={ - 'Input': input, - 'Weight': weight, - 'ProjWeight': proj_weight, - 'Bias': bias - }, + inputs=inputs, outputs={ 'Projection': projection, 'Cell': cell, - 'OrderedP0': ordered_proj0, 'BatchHidden': batch_hidden, 'BatchGate': batch_gate, 'BatchCellPreAct': batch_cell_pre_act }, attrs={ 'use_peepholes': use_peepholes, + 'cell_clip': cell_clip, + 'proj_clip': proj_clip, 'is_reverse': is_reverse, 'gate_activation': gate_activation, 'cell_activation': cell_activation, @@ -2441,7 +2473,7 @@ def pool2d(input, data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.pool2d( + pool2d = fluid.layers.pool2d( input=data, pool_size=2, pool_type='max', @@ -2490,6 +2522,7 @@ def pool2d(input, return pool_out +@templatedoc() def pool3d(input, pool_size=-1, pool_type="max", @@ -2501,13 +2534,19 @@ def pool3d(input, name=None, exclusive=True): """ - This function adds the operator for pooling in 3-dimensions, using the - pooling configurations mentioned in input parameters. + ${comment} Args: - input (Variable): ${input_comment} - pool_size (int): ${ksize_comment} - pool_type (str): ${pooling_type_comment} + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCDHW, where N is batch size, C is + the number of channels, D is the depth of the feature, + H is the height of the feature, and W is the width + of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size + is a tuple or list, it must contain three integers, + (pool_size_Depth, pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + pool_type (string): ${pooling_type_comment} pool_stride (int): stride of the pooling layer. pool_padding (int): padding size. global_pooling (bool): ${global_pooling_comment} @@ -2520,6 +2559,19 @@ def pool3d(input, Returns: Variable: output of pool3d layer. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32, 32], dtype='float32') + pool3d = fluid.layers.pool3d( + input=data, + pool_size=2, + pool_type='max', + pool_stride=1, + global_pooling=False) """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -2569,7 +2621,27 @@ def adaptive_pool2d(input, require_index=False, name=None): """ - ${comment} + **Adaptive Pool2d Operator** + The adaptive_pool2d operation calculates the output based on the input, pool_size, + pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch + size, C is the number of channels, H is the height of the feature, and W is + the width of the feature. Parameters(pool_size) should contain two elements which + represent height and width, respectively. Also the H and W dimensions of output(Out) + is same as Parameter(pool_size). + + For average adaptive pool2d: + + .. math:: + + hstart &= floor(i * H_{in} / H_{out}) + + hend &= ceil((i + 1) * H_{in} / H_{out}) + + wstart &= floor(j * W_{in} / W_{out}) + + wend &= ceil((j + 1) * W_{in} / W_{out}) + + Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} Args: input (Variable): The input tensor of pooling operator. The format of @@ -2579,8 +2651,8 @@ def adaptive_pool2d(input, pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). 
pool_type: ${pooling_type_comment} - require_index (bool): If true, the index of max pooling point along with outputs. - it cannot be set in average pooling type. + require_index (bool): If true, the index of max pooling point will be returned along + with outputs. It cannot be set in average pooling type. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2661,18 +2733,42 @@ def adaptive_pool3d(input, require_index=False, name=None): """ - ${comment} + **Adaptive Pool3d Operator** + The adaptive_pool3d operation calculates the output based on the input, pool_size, + pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch + size, C is the number of channels, D is the depth of the feature, H is the height of + the feature, and W is the width of the feature. Parameters(pool_size) should contain + three elements which represent height and width, respectively. Also the D, H and W + dimensions of output(Out) is same as Parameter(pool_size). + + For average adaptive pool3d: + + .. math:: + + dstart &= floor(i * D_{in} / D_{out}) + + dend &= ceil((i + 1) * D_{in} / D_{out}) + + hstart &= floor(j * H_{in} / H_{out}) + + hend &= ceil((j + 1) * H_{in} / H_{out}) + + wstart &= floor(k * W_{in} / W_{out}) + + wend &= ceil((k + 1) * W_{in} / W_{out}) + + Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} Args: input (Variable): The input tensor of pooling operator. The format of - input tensor is NCHW, where N is batch size, C is - the number of channels, H is the height of the - feature, and W is the width of the feature. + input tensor is NCDHW, where N is batch size, C is + the number of channels, D is the depth of the feature, + H is the height of the feature, and W is the width of the feature. pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two integers, (Depth, Height, Width). + it must contain three integers, (Depth, Height, Width). pool_type: ${pooling_type_comment} - require_index (bool): If true, the index of max pooling point along with outputs. - it cannot be set in average pooling type. + require_index (bool): If true, the index of max pooling point will be returned along + with outputs. It cannot be set in average pooling type. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2709,7 +2805,7 @@ def adaptive_pool3d(input, name='data', shape=[3, 32, 32], dtype='float32') pool_out, mask = fluid.layers.adaptive_pool3d( input=data, - pool_size=[3, 3], + pool_size=[3, 3, 3], pool_type='avg') """ if pool_type not in ["max", "avg"]: @@ -5765,6 +5861,132 @@ def softmax_with_cross_entropy(logits, return loss +def sampled_softmax_with_cross_entropy(logits, + label, + num_samples, + num_true=1, + remove_accidental_hits=True, + use_customized_samples=False, + customized_samples=None, + customized_probabilities=None, + seed=0): + """ + **Sampled Softmax With Cross Entropy Operator.** + + Cross entropy loss with sampled softmax is used as the output layer for + larger output classes extensively. This operator samples a number of samples + for all examples, and computes the softmax normalized values for each + row of the sampled tensor, after which cross-entropy loss is computed. + + Because this operator performs a softmax on logits internally, it expects + unscaled logits. 
This operator should not be used with the output of + softmax operator since that would produce incorrect results. + + For examples with T true labels (T >= 1), we assume that each true label has + a probability of 1/T. For each sample, S samples are generated using a + log uniform distribution. True labels are concatenated with these samples to + form T + S samples for each example. So, assume the shape of logits is + [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a + probability is calculated, which corresponds to the Q(y|x) in + [Jean et al., 2014](http://arxiv.org/abs/1412.2007). + + Logits are sampled according to the sampled labels. Then if + remove_accidental_hits is True, if a sample[i, j] accidentally hits true + labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to + make its softmax result close to zero. Then sampled logits are subtracted by + logQ(y|x), these sampled logits and re-indexed labels are used to compute + a softmax with cross entropy. + + Args: + logits (Variable): The unscaled log probabilities, which is a 2-D tensor + with shape [N x K]. N is the batch_size, and K is the class number. + label (Variable): The ground truth which is a 2-D tensor. Label is a + Tensor with shape [N x T], where T is the number of true + labels per example. + num_samples (int): The number for each example, num_samples should be + less than the number of class. + num_true(int): The number of target classes per training example. + remove_accidental_hits (bool): A flag indicating whether to remove + accidental hits when sampling. If True and if a sample[i, j] + accidentally hits true labels, then the corresponding + sampled_logits[i, j] is minus by 1e20 to make its softmax result + close to zero. Default is True. + use_customized_samples (bool): Whether to use custom samples and probabities to sample + logits. + customized_samples (Variable): User defined samples, which is a 2-D tensor + with shape [N, T + S]. S is the num_samples, and T is the number of true + labels per example. + customized_probabilities (Variable): User defined probabilities of samples, + a 2-D tensor which has the same shape with customized_samples. + seed (int): The random seed for generating random number, which is used + in the process of sampling. Default is 0. + + Returns: + Variable: Return the cross entropy loss which is a 2-D tensor with shape + [N x 1]. + + Examples: + .. 
code-block:: python + + logits = fluid.layers.data(name='data', shape=[256], dtype='float32') + label = fluid.layers.data(name='label', shape=[5], dtype='int64') + fc = fluid.layers.fc(input=data, size=100) + out = fluid.layers.sampled_softmax_with_cross_entropy( + logits=fc, label=label, num_samples=25) + """ + helper = LayerHelper('sample_logits', **locals()) + samples = helper.create_variable_for_type_inference(dtype='int64') + probabilities = helper.create_variable_for_type_inference( + dtype=logits.dtype) + sampled_logits \ + = helper.create_variable_for_type_inference(dtype=logits.dtype) + sampled_label = helper.create_variable_for_type_inference(dtype='int64') + sampled_softlabel = helper.create_variable_for_type_inference( + dtype=logits.dtype) + + helper.append_op( + type='sample_logits', + inputs={ + 'Logits': logits, + 'Labels': label, + 'CustomizedSamples': customized_samples, + 'CustomizedProbabilities': customized_probabilities + }, + outputs={ + 'Samples': samples, + 'Probabilities': probabilities, + 'SampledLabels': sampled_label, + 'SampledLogits': sampled_logits + }, + attrs={ + 'use_customized_samples': use_customized_samples, + 'uniq': True, + 'remove_accidental_hits': remove_accidental_hits, + 'num_samples': num_samples, + 'seed': seed + }) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + helper.append_op( + type='one_hot', + inputs={'X': sampled_label}, + attrs={'depth': num_samples + 1}, + outputs={'Out': sampled_softlabel}) + + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': sampled_logits, + 'Label': sampled_softlabel}, + outputs={'Softmax': softmax, + 'Loss': loss}, + attrs={ + 'soft_label': True, + 'ignore_index': False, + 'numeric_stable_mode': False + }) + return loss / num_true + + def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): """ This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`. diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 6b4dc4ac89af43271ff4cbb51b02fe78af754a7b..4381727a090bdb1d13fb692e64e8d6fb69bba0d7 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -60,7 +60,28 @@ __all__ += ["uniform_random"] _uniform_random_ = generate_layer_fn('uniform_random') -def uniform_random(shape, dtype=None, min=None, max=None, seed=None): +def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0): + """ + This operator initializes a variable with random values sampled from a + uniform distribution. The random result is in set [min, max]. + + Args: + shape (list): The shape of output variable. + dtype(np.dtype|core.VarDesc.VarType|str): The type of data, such as + float32, float64 etc. Default: float32. + min (float): Minimum value of uniform random. Default -1.0. + max (float): Maximun value of uniform random. Default 1.0. + seed (int): Random seed used for generating samples. 0 means use a + seed generated by the system. Note that if seed is not 0, this + operator will always generate the same random numbers every time. + Default 0. + + Examples: + .. 
code-block:: python + + result = fluid.layers.uniform_random(shape=[32, 784]) + """ + locals_var = locals().keys() if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -72,12 +93,6 @@ def uniform_random(shape, dtype=None, min=None, max=None, seed=None): return _uniform_random_(**kwargs) -uniform_random.__doc__ = _uniform_random_.__doc__ + """ -Examples: - - >>> result = fluid.layers.uniform_random(shape=[32, 784]) -""" - __all__ += ['hard_shrink'] _hard_shrink_ = generate_layer_fn('hard_shrink') diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 8586670c2481a0f997e84f33860b6df28b3223ae..889156ff74d6eb1108b23f365f1a081c5b8222b2 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,15 +29,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy -def _is_pserver_mode(main_program): - main = main_program if main_program \ - else framework.default_main_program() - for op in main.global_block().ops: - if op.type in ["send", "recv"]: - return True - return False - - class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -140,7 +131,7 @@ class ParallelExecutor(object): # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, # num_trainers is 1, so the current fields of build_strategy doesn't tell if # it's distributed model. - build_strategy.is_distribution = _is_pserver_mode( + build_strategy.is_distribution = framework.is_pserver_mode( main_program) or num_trainers > 1 # step4: get main_program, scope, local_scopes @@ -185,10 +176,13 @@ class ParallelExecutor(object): places = list(map(place_obj, self._places)) # step7: init ParallelExecutor + # ParallelExecutor API will be deprecated, don't support parallel graph. 
+ self._graph = core.Graph(main.desc) + self.executor = core.ParallelExecutor( - places, persistable_vars, main.desc, + places, persistable_vars, cpt.to_text(loss_name) if loss_name else six.u(''), scope, - local_scopes, exec_strategy, build_strategy) + local_scopes, exec_strategy, build_strategy, self._graph) self.scope = scope diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index ad94a4b21c347c9a2782437948c20d3b3071c679..0f301de47f53f3fcacd38d1415ebdbd7b4efc8f1 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -18,8 +18,8 @@ import unittest import numpy as np import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest -from scipy.special import expit from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs +import paddle.fluid as fluid class TestMKLDNNReluDim2(TestRelu): @@ -97,5 +97,64 @@ class TestMKLDNNAbsDim4(TestAbs): self.attrs = {"use_mkldnn": True} +# Check if primitives already exist in backward +class TestMKLDNNReluPrimitivesAlreadyExist(unittest.TestCase): + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def test_check_forward_backward(self): + place = core.CPUPlace() + + np.random.seed(123) + x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32) + out = np.abs(x) + + out_grad = np.random.random_sample(x.shape).astype(np.float32) + x_grad = out_grad * np.sign(x) # Abs grad calculation + + var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad} + var_names = list(var_dict.keys()) + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, dtype='float32', shape=ground_truth[name].shape) + + relu_op = block.append_op( + type="abs", + inputs={"X": block.var('x'), }, + outputs={"Out": block.var('out')}, + attrs={"use_mkldnn": True}) + + # Generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + relu_op.desc, set(), []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + exe = fluid.Executor(place) + + # Do at least 2 iterations + for i in range(2): + out = exe.run( + program, + feed={name: var_dict[name] + for name in ['x', 'out@GRAD']}, + fetch_list=['x@GRAD']) + + self.__assert_close(x_grad, out[0], "x@GRAD") + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index bf00698d63624d4e20a0853641219a2735d89d25..caf9750e58889ac40c7cdde022f0b6aa5e77fc42 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -20,10 +20,10 @@ from paddle.fluid.layer_helper import LayerHelper class L1(fluid.imperative.Layer): - def 
__init__(self): - super(L1, self).__init__() + def __init__(self, prefix): + super(L1, self).__init__(prefix) self._helper = LayerHelper( - 'MyLayer', + self.full_name(), param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.1))) @@ -43,20 +43,20 @@ class L1(fluid.imperative.Layer): class L2(fluid.imperative.Layer): - def __init__(self): - super(L2, self).__init__() - self.layer1 = L1() - self.layer2 = L1() + def __init__(self, prefix): + super(L2, self).__init__(prefix) + self.layer1 = L1(self.full_name()) + self.layer2 = L1(self.full_name()) def forward(self): return self.layer1() + self.layer2() class L3(fluid.imperative.Layer): - def __init__(self): - super(L3, self).__init__() - self.layer1 = L2() - self.layer2 = L2() + def __init__(self, prefix): + super(L3, self).__init__(prefix) + self.layer1 = L2(self.full_name()) + self.layer2 = L2(self.full_name()) def forward(self): return self.layer1() + self.layer2() @@ -65,16 +65,23 @@ class L3(fluid.imperative.Layer): class TestBaseLayer(unittest.TestCase): def test_one_level(self): with fluid.imperative.guard(): - l = L1() + l = L1('test_one_level') ret = l() - self.assertEqual(l.w1.name, "MyLayer_0.w_0") - self.assertEqual(l.w2.name, "MyLayer_0.w_1") + self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0") + self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1") self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2]))) def test_three_level(self): with fluid.imperative.guard(): - l = L3() + l = L3('test_three_level') + names = [p.name for p in l.parameters()] ret = l() + self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0") + self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1") + self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0") + self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1") + self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0") + self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1") self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2]))) diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index c54e998ea875e1bd27f9816f88db0e38bc488459..dae0c466ee5ea919688b29100f77f17f5f3b8c6d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -15,7 +15,6 @@ import contextlib import unittest import numpy as np -import sys import paddle.fluid as fluid from paddle.fluid import core @@ -24,8 +23,8 @@ from test_imperative_base import new_program_scope class MyLayer(fluid.imperative.Layer): - def __init__(self): - super(MyLayer, self).__init__() + def __init__(self, name_scope): + super(MyLayer, self).__init__(name_scope) def forward(self, inputs): x = fluid.layers.relu(inputs) @@ -50,12 +49,14 @@ class MyPyLayer(fluid.imperative.PyLayer): class MLP(fluid.imperative.Layer): - def __init__(self): - super(MLP, self).__init__() - self._fc1 = FC(3, + def __init__(self, name_scope): + super(MLP, self).__init__(name_scope) + self._fc1 = FC(self.full_name(), + 3, fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.1))) - self._fc2 = FC(4, + self._fc2 = FC(self.full_name(), + 4, fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.1))) @@ -67,8 +68,9 @@ class MLP(fluid.imperative.Layer): class SimpleRNNCell(fluid.imperative.Layer): - def __init__(self, step_input_size, hidden_size, output_size, param_attr): - super(SimpleRNNCell, self).__init__() + def 
__init__(self, name_scope, step_input_size, hidden_size, output_size, + param_attr): + super(SimpleRNNCell, self).__init__(name_scope) self.step_input_size = step_input_size self.hidden_size = hidden_size self.output_size = output_size @@ -158,10 +160,11 @@ class SimpleRNNCell(fluid.imperative.Layer): class SimpleRNN(fluid.imperative.Layer): - def __init__(self): - super(SimpleRNN, self).__init__() + def __init__(self, name_scope): + super(SimpleRNN, self).__init__(name_scope) self.seq_len = 4 self._cell = SimpleRNNCell( + self.full_name(), 3, 3, 3, @@ -205,7 +208,7 @@ class TestImperative(unittest.TestCase): with fluid.imperative.guard(): cl = core.Layer() cl.forward([]) - l = fluid.imperative.Layer() + l = fluid.imperative.Layer("l") self.assertRaises(NotImplementedError, l.forward, []) def test_pylayer_func_id(self): @@ -281,7 +284,7 @@ class TestImperative(unittest.TestCase): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): var_inp = fluid.imperative.base.to_variable(np_inp) - l = MyLayer() + l = MyLayer("my_layer") x = l(var_inp)[0] self.assertIsNotNone(x) dy_out = x._numpy() @@ -291,7 +294,7 @@ class TestImperative(unittest.TestCase): with new_program_scope(): inp = fluid.layers.data( name="inp", shape=[3], append_batch_size=False) - l = MyLayer() + l = MyLayer("my_layer") x = l(inp)[0] param_grads = fluid.backward.append_backward( x, parameter_list=[l._x_for_debug.name])[0] @@ -309,7 +312,7 @@ class TestImperative(unittest.TestCase): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) with fluid.imperative.guard(): var_inp = fluid.imperative.base.to_variable(np_inp) - mlp = MLP() + mlp = MLP("mlp") out = mlp(var_inp) dy_out = out._numpy() out._backward() @@ -318,7 +321,7 @@ class TestImperative(unittest.TestCase): with new_program_scope(): inp = fluid.layers.data( name="inp", shape=[2, 2], append_batch_size=False) - mlp = MLP() + mlp = MLP("mlp") out = mlp(inp) param_grads = fluid.backward.append_backward( out, parameter_list=[mlp._fc1._w.name])[0] @@ -334,10 +337,10 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(dy_grad, static_grad)) params = mlp.parameters(True) - self.assertEqual("FC_0.w_0", params[0].name) - self.assertEqual("FC_0.b_0", params[1].name) - self.assertEqual("FC_1.w_0", params[2].name) - self.assertEqual("FC_1.b_0", params[3].name) + self.assertEqual("mlp/MLP_0/FC_0_0.w_0", params[0].name) + self.assertEqual("mlp/MLP_0/FC_0_0.b_0", params[1].name) + self.assertEqual("mlp/MLP_0/FC_1_0.w_0", params[2].name) + self.assertEqual("mlp/MLP_0/FC_1_0.b_0", params[3].name) self.assertEqual(len(params), 4) sublayers = mlp.sublayers(True) @@ -353,7 +356,7 @@ class TestImperative(unittest.TestCase): with fluid.imperative.guard(): var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) - simple_rnn = SimpleRNN() + simple_rnn = SimpleRNN("simple_rnn") outs, pre_hiddens = simple_rnn.forward(var_inp) dy_out = outs[3]._numpy() outs[3]._backward() @@ -364,7 +367,7 @@ class TestImperative(unittest.TestCase): with new_program_scope(): inp = fluid.layers.data( name="inp", shape=[1, 4, 3], append_batch_size=False) - simple_rnn = SimpleRNN() + simple_rnn = SimpleRNN("simple_rnn") outs, pre_hiddens = simple_rnn(inp) param_grads = fluid.backward.append_backward(outs[3]) exe = fluid.Executor(fluid.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 
33c196d1ab52b393491561e75054e6c323fce18d..a80202d6dddacaa4cb6fa3efd3c3dfd5b0ab4400 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -28,10 +28,10 @@ from paddle.fluid.imperative.base import to_variable class Discriminator(fluid.imperative.Layer): - def __init__(self): - super(Discriminator, self).__init__() - self._fc1 = FC(size=32, act='elu', name="d_fc1") - self._fc2 = FC(size=1, name="d_fc2") + def __init__(self, name_scope): + super(Discriminator, self).__init__(name_scope) + self._fc1 = FC(self.full_name(), size=32, act='elu') + self._fc2 = FC(self.full_name(), size=1) def forward(self, inputs): x = self._fc1(inputs) @@ -39,11 +39,11 @@ class Discriminator(fluid.imperative.Layer): class Generator(fluid.imperative.Layer): - def __init__(self): - super(Generator, self).__init__() - self._fc1 = FC(size=64, act='elu', name="g_fc1") - self._fc2 = FC(size=64, act='elu', name="g_fc2") - self._fc3 = FC(size=1, name="g_fc3") + def __init__(self, name_scope): + super(Generator, self).__init__(name_scope) + self._fc1 = FC(self.full_name(), size=64, act='elu') + self._fc2 = FC(self.full_name(), size=64, act='elu') + self._fc3 = FC(self.full_name(), size=1) def forward(self, inputs): x = self._fc1(inputs) @@ -65,8 +65,8 @@ class TestImperativeMnist(unittest.TestCase): scope = fluid.core.Scope() with new_program_scope( main=discriminate_p, startup=startup, scope=scope): - discriminator = Discriminator() - generator = Generator() + discriminator = Discriminator("d") + generator = Generator("g") img = fluid.layers.data( name="img", shape=[2, 1], append_batch_size=False) @@ -93,8 +93,8 @@ class TestImperativeMnist(unittest.TestCase): sgd.minimize(d_loss) with new_program_scope(main=generate_p, startup=startup, scope=scope): - discriminator = Discriminator() - generator = Generator() + discriminator = Discriminator("d") + generator = Generator("g") noise = fluid.layers.data( name="noise", shape=[2, 2], append_batch_size=False) @@ -134,8 +134,8 @@ class TestImperativeMnist(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - discriminator = Discriminator() - generator = Generator() + discriminator = Discriminator("d") + generator = Generator("g") sgd = SGDOptimizer(learning_rate=1e-3) d_real = discriminator(to_variable(np.ones([2, 1], np.float32))) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 08b155acc657c3a4a73f5b1d72ac356fc7e83a58..0d0a3bbe0bd47fe0e01761f8b42c92b884a5680a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -28,6 +28,7 @@ from test_imperative_base import new_program_scope class SimpleImgConvPool(fluid.imperative.Layer): def __init__(self, + name_scope, num_channels, num_filters, filter_size, @@ -44,9 +45,10 @@ class SimpleImgConvPool(fluid.imperative.Layer): use_cudnn=False, param_attr=None, bias_attr=None): - super(SimpleImgConvPool, self).__init__() + super(SimpleImgConvPool, self).__init__(name_scope) self._conv2d = Conv2D( + self.full_name(), num_channels=num_channels, num_filters=num_filters, filter_size=filter_size, @@ -59,6 +61,7 @@ class SimpleImgConvPool(fluid.imperative.Layer): use_cudnn=use_cudnn) self._pool2d = Pool2D( + self.full_name(), pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, @@ -73,19 
+76,20 @@ class SimpleImgConvPool(fluid.imperative.Layer): class MNIST(fluid.imperative.Layer): - def __init__(self, param_attr=None, bias_attr=None): - super(MNIST, self).__init__() + def __init__(self, name_scope, param_attr=None, bias_attr=None): + super(MNIST, self).__init__(name_scope) self._simple_img_conv_pool_1 = SimpleImgConvPool( - 1, 20, 5, 2, 2, act="relu") + self.full_name(), 1, 20, 5, 2, 2, act="relu") self._simple_img_conv_pool_2 = SimpleImgConvPool( - 20, 50, 5, 2, 2, act="relu") + self.full_name(), 20, 50, 5, 2, 2, act="relu") pool_2_shape = 50 * 4 * 4 SIZE = 10 scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 - self._fc = FC(10, + self._fc = FC(self.full_name(), + 10, param_attr=fluid.param_attr.ParamAttr( initializer=fluid.initializer.NormalInitializer( loc=0.0, scale=scale)), @@ -101,47 +105,48 @@ class MNIST(fluid.imperative.Layer): class TestImperativeMnist(unittest.TestCase): def test_mnist_float32(self): seed = 90 - batch_num = 2 + epoch_num = 1 with fluid.imperative.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - mnist = MNIST() + mnist = MNIST("mnist") sgd = SGDOptimizer(learning_rate=1e-3) train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=128) + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) dy_param_init_value = {} - for batch_id, data in enumerate(train_reader()): - if batch_id >= batch_num: - break - - dy_x_data = np.array( - [x[0].reshape(1, 28, 28) for x in data]).astype('float32') - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - 128, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) - label._stop_gradient = True - - cost = mnist(img) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - dy_out = avg_loss._numpy() - - if batch_id == 0: - for param in fluid.default_main_program().global_block( - ).all_parameters(): - dy_param_init_value[param.name] = param._numpy() - - avg_loss._backward() - sgd.minimize(avg_loss) - mnist.clear_gradients() - dy_param_value = {} - for param in fluid.default_main_program().global_block( - ).all_parameters(): - dy_param_value[param.name] = param._numpy() + for epoch in range(epoch_num): + for batch_id, data in enumerate(train_reader()): + dy_x_data = np.array( + [x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(128, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + + cost = mnist(img) + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + + dy_out = avg_loss._numpy() + + if epoch == 0 and batch_id == 0: + for param in mnist.parameters(): + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + sgd.minimize(avg_loss) + mnist.clear_gradients() + + fluid.default_main_program().global_block()._clear_block() + + dy_param_value = {} + for param in mnist.parameters(): + dy_param_value[param.name] = param._numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -150,10 +155,10 @@ class TestImperativeMnist(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - mnist = MNIST() + mnist = MNIST("mnist") sgd = SGDOptimizer(learning_rate=1e-3) train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=128) + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) img = 
fluid.layers.data( name='pixel', shape=[1, 28, 28], dtype='float32') @@ -166,8 +171,7 @@ class TestImperativeMnist(unittest.TestCase): # initialize params and fetch them static_param_init_value = {} static_param_name_list = [] - for param in fluid.default_startup_program().global_block( - ).all_parameters(): + for param in mnist.parameters(): static_param_name_list.append(param.name) out = exe.run(fluid.default_startup_program(), @@ -176,26 +180,29 @@ class TestImperativeMnist(unittest.TestCase): for i in range(len(static_param_name_list)): static_param_init_value[static_param_name_list[i]] = out[i] - for batch_id, data in enumerate(train_reader()): - if batch_id >= batch_num: - break - - static_x_data = np.array( - [x[0].reshape(1, 28, 28) for x in data]).astype('float32') - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - [128, 1]) - - fetch_list = [avg_loss.name] - fetch_list.extend(static_param_name_list) - out = exe.run(fluid.default_main_program(), - feed={"pixel": static_x_data, - "label": y_data}, - fetch_list=fetch_list) - - static_param_value = {} - static_out = out[0] - for i in range(1, len(out)): - static_param_value[static_param_name_list[i - 1]] = out[i] + for epoch in range(epoch_num): + for batch_id, data in enumerate(train_reader()): + static_x_data = np.array( + [x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape([128, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run( + fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[ + i] + + self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all())) for key, value in six.iteritems(static_param_init_value): self.assertTrue(np.allclose(value, dy_param_init_value[key])) @@ -203,7 +210,7 @@ class TestImperativeMnist(unittest.TestCase): self.assertTrue(np.allclose(static_out, dy_out)) for key, value in six.iteritems(static_param_value): - self.assertTrue(np.allclose(value, dy_param_value[key])) + self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 7cf3bf13d2072bac3bf7bd74760c56e7ac12b8a7..c8e42d5ede57896b0d5c09a2334709ced2d16a3f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -28,12 +28,13 @@ from paddle.fluid.backward import append_backward class SimpleLSTMRNN(fluid.imperative.Layer): def __init__(self, + name_scope, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None): - super(SimpleLSTMRNN, self).__init__() + super(SimpleLSTMRNN, self).__init__(name_scope) self._hidden_size = hidden_size self._num_layers = num_layers self._init_scale = init_scale @@ -130,13 +131,14 @@ class SimpleLSTMRNN(fluid.imperative.Layer): class PtbModel(fluid.imperative.Layer): def __init__(self, + name_scope, hidden_size, vocab_size, num_layers=2, num_steps=20, init_scale=0.1, dropout=None): - super(PtbModel, self).__init__() + super(PtbModel, self).__init__(name_scope) self.hidden_size = hidden_size self.vocab_size = vocab_size self.init_scale = init_scale @@ -146,12 +148,14 @@ class PtbModel(fluid.imperative.Layer): 
from paddle.fluid.layer_helper import LayerHelper self._helper = LayerHelper('PtbModel', act="tanh") self.simple_lstm_rnn = SimpleLSTMRNN( + self.full_name(), hidden_size, num_steps, num_layers=num_layers, init_scale=init_scale, dropout=dropout) self.embedding = Embedding( + self.full_name(), size=[vocab_size, hidden_size], dtype='float32', is_sparse=False, @@ -226,6 +230,7 @@ class TestImperativePtbRnn(unittest.TestCase): fluid.default_main_program().random_seed = seed # TODO: marsyang1993 Change seed to ptb_model = PtbModel( + "ptb_model", hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, @@ -265,6 +270,7 @@ class TestImperativePtbRnn(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed ptb_model = PtbModel( + "ptb_model", hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 128d18621db8374c6c385dddbefc0d29e760a02f..4892495e1108e6d2a7e96cab88dc7668e360d79f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -70,15 +70,17 @@ def optimizer_setting(params): class ConvBNLayer(fluid.imperative.Layer): def __init__(self, + name_scope, num_channels, num_filters, filter_size, stride=1, groups=1, act=None): - super(ConvBNLayer, self).__init__() + super(ConvBNLayer, self).__init__(name_scope) self._conv = Conv2D( + self.full_name(), num_channels=num_channels, num_filters=num_filters, filter_size=filter_size, @@ -88,7 +90,7 @@ class ConvBNLayer(fluid.imperative.Layer): act=None, bias_attr=None) - self._batch_norm = BatchNorm(num_filters, act=act) + self._batch_norm = BatchNorm(self.full_name(), num_filters, act=act) def forward(self, inputs): y = self._conv(inputs) @@ -98,21 +100,29 @@ class ConvBNLayer(fluid.imperative.Layer): class BottleneckBlock(fluid.imperative.Layer): - def __init__(self, num_channels, num_filters, stride, shortcut=True): - super(BottleneckBlock, self).__init__() + def __init__(self, + name_scope, + num_channels, + num_filters, + stride, + shortcut=True): + super(BottleneckBlock, self).__init__(name_scope) self.conv0 = ConvBNLayer( + self.full_name(), num_channels=num_channels, num_filters=num_filters, filter_size=1, act='relu') self.conv1 = ConvBNLayer( + self.full_name(), num_channels=num_filters, num_filters=num_filters, filter_size=3, stride=stride, act='relu') self.conv2 = ConvBNLayer( + self.full_name(), num_channels=num_filters, num_filters=num_filters * 4, filter_size=1, @@ -120,6 +130,7 @@ class BottleneckBlock(fluid.imperative.Layer): if not shortcut: self.short = ConvBNLayer( + self.full_name(), num_channels=num_channels, num_filters=num_filters * 4, filter_size=1, @@ -141,13 +152,13 @@ class BottleneckBlock(fluid.imperative.Layer): y = fluid.layers.elementwise_add(x=short, y=conv2) - layer_helper = LayerHelper('elementwise_add_activation', act='relu') + layer_helper = LayerHelper(self.full_name(), act='relu') return layer_helper.append_activation(y) class ResNet(fluid.imperative.Layer): - def __init__(self, layers=50, class_dim=102): - super(ResNet, self).__init__() + def __init__(self, name_scope, layers=50, class_dim=102): + super(ResNet, self).__init__(name_scope) self.layers = layers supported_layers = [50, 101, 152] @@ -163,9 +174,18 @@ class ResNet(fluid.imperative.Layer): num_filters = [64, 128, 256, 512] self.conv = 
ConvBNLayer( - num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') + self.full_name(), + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') self.pool2d_max = Pool2D( - pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + self.full_name(), + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') self.bottleneck_block_list = [] num_channels = 64 @@ -175,6 +195,7 @@ class ResNet(fluid.imperative.Layer): bottleneck_block = self.add_sublayer( 'bb_%d_%d' % (block, i), BottleneckBlock( + self.full_name(), num_channels=num_channels, num_filters=num_filters[block], stride=2 if i == 0 and block != 0 else 1, @@ -184,12 +205,13 @@ class ResNet(fluid.imperative.Layer): shortcut = True self.pool2d_avg = Pool2D( - pool_size=7, pool_type='avg', global_pooling=True) + self.full_name(), pool_size=7, pool_type='avg', global_pooling=True) import math stdv = 1.0 / math.sqrt(2048 * 1.0) - self.out = FC(size=class_dim, + self.out = FC(self.full_name(), + size=class_dim, act='softmax', param_attr=fluid.param_attr.ParamAttr( initializer=fluid.initializer.Uniform(-stdv, stdv))) @@ -209,12 +231,12 @@ class TestImperativeResnet(unittest.TestCase): seed = 90 batch_size = train_parameters["batch_size"] - batch_num = 1 + batch_num = 2 with fluid.imperative.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - resnet = ResNet() + resnet = ResNet("resnet") optimizer = optimizer_setting(train_parameters) np.random.seed(seed) import random @@ -264,6 +286,8 @@ class TestImperativeResnet(unittest.TestCase): optimizer.minimize(avg_loss) resnet.clear_gradients() + fluid.default_main_program().global_block()._clear_block() + dy_param_value = {} for param in resnet.parameters(): dy_param_value[param.name] = param._numpy() @@ -275,7 +299,7 @@ class TestImperativeResnet(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - resnet = ResNet() + resnet = ResNet("resnet") optimizer = optimizer_setting(train_parameters) np.random.seed(seed) @@ -297,11 +321,9 @@ class TestImperativeResnet(unittest.TestCase): static_param_init_value = {} static_param_name_list = [] static_grad_name_list = [] - for param in fluid.default_startup_program().global_block( - ).all_parameters(): + for param in resnet.parameters(): static_param_name_list.append(param.name) - for param in fluid.default_main_program().global_block( - ).all_parameters(): + for param in resnet.parameters(): if not param.stop_gradient: static_grad_name_list.append(param.name + core.grad_var_suffix()) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e7bc1601a54c8615e0e787d74145aa4987b6cb88..30194f8cacfea2361ffe4afe537287a261cf470b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -374,6 +374,17 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_sampled_softmax_with_cross_entropy(self): + program = Program() + with program_guard(program): + logits = layers.data(name='Logits', shape=[256], dtype='float64') + label = layers.data(name='Label', shape=[1], dtype='int64') + num_samples = 25 + output = layers.sampled_softmax_with_cross_entropy(logits, label, + num_samples) + self.assertIsNotNone(output) + print(str(program)) + @decorators.prog_scope() def test_nce(self): window_size = 5 diff --git 
a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py index 9c3ec45515ffe0a07541fd9cfb7e92b079264071..0645cfedb8089f5618c54672cac91343e5dee285 100644 --- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py @@ -36,12 +36,14 @@ def lstmp( w_b=None, # 1 x 4D w_c=None, # 1 x 3D is_reverse=False, + proj_clip=0.0, + cell_clip=0.0, act_gate=None, act_cell=None, act_cand=None, act_proj=None): - def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand, - act_proj): + def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate, + act_cell, act_cand, act_proj): g = np.dot(r_pre, w_r) # 1 x 4D g = g + x g = np.reshape(g, (1, g.size)) @@ -55,6 +57,17 @@ def lstmp( g_f = act_gate(g_f + w_fc * c_pre) # 1 x D c = g_f * c_pre + g_i * act_cand(c) # 1 x D + def array_clip(a, clip): + size = np.prod(a.shape) + new_a = np.reshape(a, (size)) + for i in range(size): + new_a[i] = max(new_a[i], -1.0 * clip) + new_a[i] = min(new_a[i], clip) + new_a = np.reshape(new_a, a.shape) + return new_a + + if cell_clip > 0.0: + c = array_clip(c, cell_clip) if w_c is None: g_o = act_gate(g_o) # 1 x D else: @@ -64,6 +77,8 @@ def lstmp( # projection r = np.dot(h, w_rh) r = act_proj(r) + if proj_clip > 0.0: + r = array_clip(r, proj_clip) return r, c def _reverse(x, offset): @@ -87,13 +102,13 @@ def lstmp( # compute one sequence seq_len = lod[0][i] x = input[offset[i]:offset[i + 1], :] - r_pre = np.dot(h0[i], w_rh) # 1 x P - r_pre = act_proj(r_pre) + r_pre = h0[i] c_pre = c0[i] # 1 x D for j in range(seq_len): # compute one step - r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate, - act_cell, act_cand, act_proj) + r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, proj_clip, + cell_clip, act_gate, act_cell, act_cand, + act_proj) projection.append(r_pre.flatten()) cell.append(c_pre.flatten()) @@ -123,13 +138,12 @@ class TestLstmpOp(LstmTest.TestLstmOp): T = sum(self.lod[0]) N = len(self.lod[0]) - x = np.random.normal(size=(T, 4 * self.D)).astype('float64') if self.has_initial_state: - h0 = np.random.normal(size=(N, self.D)).astype('float64') + h0 = np.random.normal(size=(N, self.P)).astype('float64') c0 = np.random.normal(size=(N, self.D)).astype('float64') else: - h0 = np.zeros((N, self.D)).astype('float64') + h0 = np.zeros((N, self.P)).astype('float64') c0 = np.zeros((N, self.D)).astype('float64') w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64') if self.use_peepholes: @@ -140,9 +154,12 @@ class TestLstmpOp(LstmTest.TestLstmOp): w_b = b[:, 0:4 * self.D] w_c = b[:, 4 * self.D:] if self.use_peepholes else None w_rh = np.random.normal(size=(self.D, self.P)).astype('float64') + proj_clip = 0.1 + cell_clip = 0.1 r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse, - ACTIVATION[self.act_gate], ACTIVATION[self.act_cell], - ACTIVATION[self.act_cand], ACTIVATION[self.act_proj]) + proj_clip, cell_clip, ACTIVATION[self.act_gate], + ACTIVATION[self.act_cell], ACTIVATION[self.act_cand], + ACTIVATION[self.act_proj]) self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh} @@ -159,6 +176,8 @@ class TestLstmpOp(LstmTest.TestLstmOp): self.attrs = { 'use_peepholes': self.use_peepholes, 'is_reverse': self.is_reverse, + 'proj_clip': proj_clip, + 'cell_clip': cell_clip, 'gate_activation': self.act_gate, 'cell_activation': self.act_cell, 'candidate_activation': self.act_cand, @@ -171,14 +190,14 @@ class TestLstmpOp(LstmTest.TestLstmOp): 
def test_check_grad(self): # TODO(qingqing) remove folowing lines after the check_grad is refined. N = len(self.lod[0]) - self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( (N, self.D)).astype('float64') self.check_grad( ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'], - max_relative_error=1e-2) + max_relative_error=1e-2, + numeric_grad_delta=0.0000005) class TestLstmpOpHasInitial(TestLstmpOp): @@ -188,7 +207,6 @@ class TestLstmpOpHasInitial(TestLstmpOp): def test_check_grad(self): # TODO(qingqing) remove folowing lines after the check_grad is refined. N = len(self.lod[0]) - self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( @@ -196,11 +214,11 @@ class TestLstmpOpHasInitial(TestLstmpOp): self.check_grad( ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'], ['Projection'], + numeric_grad_delta=0.0000005, max_relative_error=1e-2) def test_check_grad_ingore_bias(self): N = len(self.lod[0]) - self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( @@ -208,11 +226,11 @@ class TestLstmpOpHasInitial(TestLstmpOp): self.check_grad( ['Input', 'ProjWeight', 'Weight'], ['Projection'], max_relative_error=1e-2, + numeric_grad_delta=0.0000005, no_grad_set=set('Bias')) def test_check_grad_ingore_weight(self): N = len(self.lod[0]) - self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( @@ -220,11 +238,11 @@ class TestLstmpOpHasInitial(TestLstmpOp): self.check_grad( ['Input', 'ProjWeight', 'Bias'], ['Projection'], max_relative_error=1e-2, + numeric_grad_delta=0.0000005, no_grad_set=set('Weight')) def test_check_grad_ingore_proj_weight(self): N = len(self.lod[0]) - self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( @@ -232,11 +250,11 @@ class TestLstmpOpHasInitial(TestLstmpOp): self.check_grad( ['Input', 'Weight', 'Bias'], ['Projection'], max_relative_error=1e-2, + numeric_grad_delta=0.0000005, no_grad_set=set('ProjWeight')) def test_check_grad_ingore_input(self): N = len(self.lod[0]) - self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( @@ -244,11 +262,11 @@ class TestLstmpOpHasInitial(TestLstmpOp): self.check_grad( ['Weight', 'ProjWeight', 'Bias'], ['Projection'], max_relative_error=1e-2, + numeric_grad_delta=0.0000005, no_grad_set=set('Input')) def test_check_grad_ingore_h0(self): N = len(self.lod[0]) - self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = 
np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( @@ -256,11 +274,11 @@ class TestLstmpOpHasInitial(TestLstmpOp): self.check_grad( ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'], max_relative_error=1e-2, + numeric_grad_delta=0.0000005, no_grad_set=set('H0')) def test_check_grad_ingore_c0(self): N = len(self.lod[0]) - self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( @@ -268,6 +286,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): self.check_grad( ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'], max_relative_error=1e-2, + numeric_grad_delta=0.0000005, no_grad_set=set('C0'))
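The dynamic_lstmp changes above add optional h_0, c_0, cell_clip and proj_clip arguments to fluid.layers.dynamic_lstmp. The following is only a minimal usage sketch of the updated API; the dictionary size, layer dimensions and clip values are illustrative assumptions, not taken from the patch.

.. code-block:: python

    import paddle.fluid as fluid

    dict_dim, emb_dim = 128, 64          # assumed sizes, for illustration only
    hidden_dim, proj_dim = 512, 256

    words = fluid.layers.data(
        name='words', shape=[1], dtype='int64', lod_level=1)
    emb = fluid.layers.embedding(input=words, size=[dict_dim, emb_dim])
    # dynamic_lstmp expects an input of width 4 * hidden_dim.
    fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4, bias_attr=False)

    proj, cell = fluid.layers.dynamic_lstmp(
        input=fc_out,
        size=hidden_dim * 4,
        proj_size=proj_dim,
        cell_clip=3.0,   # new: clamp the cell state to [-3, 3] before the output activation
        proj_clip=3.0,   # new: clamp the projected output to [-3, 3]
        use_peepholes=False)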
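The rewritten adaptive_pool2d and adaptive_pool3d docstrings spell out how the pooling windows are derived from pool_size. A small sketch of the 2-D case makes the window arithmetic concrete; the input shape and output size here are assumptions for illustration.

.. code-block:: python

    import paddle.fluid as fluid

    data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
    # Average adaptive pooling to a fixed 3 x 3 output. With H_in = 32 and
    # H_out = 3, the i-th window is [floor(i*32/3), ceil((i+1)*32/3)), i.e.
    # rows [0, 11), [10, 22) and [21, 32); columns are split the same way.
    pool_out = fluid.layers.adaptive_pool2d(
        input=data, pool_size=[3, 3], pool_type='avg')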
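The example embedded in the sampled_softmax_with_cross_entropy docstring binds the data layer to the name `logits` but then calls `fluid.layers.fc(input=data, ...)`. A restatement with consistent variable names is sketched below; the feature width, class count, sample count and label shape (one true label per example, matching the num_true default) are illustrative assumptions.

.. code-block:: python

    import paddle.fluid as fluid

    feats = fluid.layers.data(name='data', shape=[256], dtype='float32')
    # One true label per example (num_true defaults to 1).
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    fc = fluid.layers.fc(input=feats, size=100)
    loss = fluid.layers.sampled_softmax_with_cross_entropy(
        logits=fc, label=label, num_samples=25)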
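Most of the imperative-mode test churn follows one convention: every fluid.imperative.Layer subclass now receives a name_scope, forwards it to super().__init__, and hands self.full_name() to any sub-layer it creates, so parameter names become hierarchical. A minimal sketch of the pattern, assuming the paddle.fluid.imperative.nn.FC layer used by these tests:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.imperative.nn import FC

    class MLP(fluid.imperative.Layer):
        def __init__(self, name_scope):
            super(MLP, self).__init__(name_scope)
            # Sub-layers are created under this layer's full name.
            self._fc1 = FC(self.full_name(), 3)
            self._fc2 = FC(self.full_name(), 4)

        def forward(self, inputs):
            return self._fc2(self._fc1(inputs))

    with fluid.imperative.guard():
        mlp = MLP("mlp")
        out = mlp(fluid.imperative.base.to_variable(
            np.ones([2, 2], dtype=np.float32)))
        # Parameters now carry hierarchical names such as
        # "mlp/MLP_0/FC_0_0.w_0", which is what test_imperative.py asserts.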
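In the updated lstmp numpy reference, array_clip clamps each element to [-clip, clip] with an explicit Python loop. Functionally this is the same as numpy's built-in clip; a compact equivalent is shown here only for comparison, the test itself keeps the loop.

.. code-block:: python

    import numpy as np

    def array_clip(a, clip):
        # Clamp every element of `a` to the interval [-clip, clip],
        # matching the loop-based reference in test_lstmp_op.py.
        return np.clip(a, -clip, clip)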