Commit c4187dbd authored by xiaoli.liu@intel.com

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dequantize-reuse

......@@ -3,8 +3,8 @@
English | [简体中文](./README_cn.md)
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......@@ -18,7 +18,7 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
### Install Latest Stable Release:
```
# Linux CPU
......@@ -26,9 +26,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.2.0.post87
pip install paddlepaddle-gpu==1.3.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.2.0.post85
pip install paddlepaddle-gpu==1.3.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
......@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website.
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html)
You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
- [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html)
Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html)
We appreciate your contributions!
......
......@@ -3,8 +3,8 @@
[English](./README.md) | 简体中文
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......@@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
### 安装最新稳定版本:
```
# Linux CPU
......@@ -24,9 +24,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.2.0.post87
pip install paddlepaddle-gpu==1.3.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.2.0.post85
pip install paddlepaddle-gpu==1.3.0.post85
# 其他平台上的安装指引请参考 http://paddlepaddle.org/
```
......@@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
## 安装
推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html)
推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html)
## 文档
我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档
我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档
- [深度学习101](https://github.com/PaddlePaddle/book)
或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html)
可以在MPI集群上运行分布式训练任务
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html)
新的API支持代码更少更简洁的程序
- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html)
欢迎您的贡献!
......
......@@ -168,6 +168,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
......
......@@ -43,7 +43,7 @@ paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'start
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
......@@ -71,7 +71,7 @@ paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'v
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None))
paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
......@@ -92,7 +92,7 @@ paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'poo
paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False))
paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
......@@ -121,6 +121,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.sampled_softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
......@@ -303,7 +304,7 @@ paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keyword
paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0))
paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -163,6 +163,20 @@ std::vector<OpDesc *> BlockDesc::AllOps() const {
return res;
}
void BlockDesc::Clear() {
// clear all ops
ops_.clear();
// clear all vars which are not persistable
for (auto it = vars_.begin(); it != vars_.end();) {
if (it->second->Persistable()) {
++it;
} else {
vars_.erase(it++);
}
}
}
void BlockDesc::Flush() {
for (auto &op_desc : ops_) {
op_desc->Flush();
......
......@@ -97,6 +97,8 @@ class BlockDesc {
std::vector<OpDesc *> AllOps() const;
void Clear();
size_t OpSize() const { return ops_.size(); }
OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
......
......@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
......@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// output tensor has the same dims as input. Reorder doesn't change dims
out->Resize(in.dims());
if (in_format != out_format) {
// temporary memory primitive descriptor for out, used to perform the reorder
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
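// Reorder only when the input's primitive descriptor differs from the target
// one; otherwise the buffer is shared as-is below.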
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory =
memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
auto out_memory = memory(out_mem_pd, out_data);
platform::Reorder(in_memory, out_memory);
} else {
out->ShareDataWith(in);
}
out->set_layout(out_layout);
// reset format since the out tensor will be fed to a non-MKLDNN OpKernel
out->set_format(memory::format::format_undef);
#endif
}
......
......@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
out.ShareDataWith(input_tensor);
// TODO(jczaja): Remove that once all mkldnn ops
// are modified to work with mkldnn_blocked
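// Pick a plain (non-blocked) MKL-DNN memory format matching the tensor rank.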
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN);
out.set_format(out_format);
out.set_mkldnn_prim_desc(out_mem_pd);
#endif
} else {
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
......@@ -50,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that the topology sort must be stable
auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
......@@ -120,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
REGISTER_PASS(all_reduce_deps_pass,
paddle::framework::details::AllReduceDepsPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
.RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
......@@ -135,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass;
if (strategy_.is_distribution_) {
VLOG(3) << "multi device parameter server mode";
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
VLOG(3) << "multi devices collective mode with allreduce";
multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
VLOG(3) << "multi deivces collective mode with reduce";
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else {
PADDLE_THROW("Unknown reduce strategy.");
......@@ -171,7 +174,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
......@@ -182,7 +186,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
// Create a default one if not finalized by user.
CreatePassesFromStrategy(false);
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (IsMultiDevPass(pass->Type())) {
pass->Erase(kPlaces);
......@@ -200,41 +203,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "memory_optimize_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership
pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
} else if (pass->Type() == "sequential_execution_pass") {
LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") {
LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "inplace_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
graph->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
if (!use_cuda) {
LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
......
......@@ -114,7 +114,7 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes,
......
......@@ -24,12 +24,11 @@ namespace details {
FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph)
const std::vector<platform::Place> &places, ir::Graph *graph)
: strategy_(strategy),
local_scopes_(local_scopes),
places_(places),
graph_(std::move(graph)),
graph_(graph),
pool_(strategy.num_threads_),
prepare_pool_(1), // add one more thread for generate op_deps
fetch_ctxs_(places) {
......@@ -110,14 +109,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
}
}
if (exception_.IsCaught()) {
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
exception_.ReThrow();
}
}
num_complete += num_comp;
}
// Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
return fetches;
}
......
......@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
const ir::Graph &Graph() const override;
......@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
std::unique_ptr<ir::Graph> graph_;
ir::Graph *graph_;
std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_;
......
......@@ -33,10 +33,10 @@ namespace details {
using paddle::framework::VarDesc;
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
PADDLE_ENFORCE(graph.Has(kStaleProgramOpDescs),
"Graph has no attribute of kStaleProgramOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
......@@ -461,11 +461,21 @@ void ControlFlowGraph::LiveVariableAnalysis() {
}
}
}
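// Cache, for every op, the variables that are live-in but not live-out,
// i.e. the variables that are no longer needed after this op.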
for (auto* op : ops_) {
unlived_vars_[op] = std::set<std::string>();
for (auto& var : this->LiveIn(op)) {
if (!this->LiveOut(op).count(var)) {
unlived_vars_[op].insert(var);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
std::vector<bool> need_update(ops_.size(), false);
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
......@@ -480,15 +490,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
need_update[i] = true;
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
need_update[i] = true;
}
}
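// Refresh the cached unlived sets only for the ops whose live-in/live-out
// changed above.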
for (size_t i = begin_idx; i < ops_.size(); ++i) {
if (!need_update[i]) continue;
auto* op = ops_[i];
for (auto& var : this->LiveIn(op)) {
if (!this->LiveOut(op).count(var)) {
unlived_vars_[op].insert(var);
}
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
const std::set<std::string>& ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
......@@ -496,7 +518,7 @@ const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
const std::set<std::string>& ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
......@@ -504,15 +526,24 @@ const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
const std::set<std::string>& ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
string::Sprintf("Expect %s in use, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string>& ControlFlowGraph::Unlived(ir::Node* op) const {
auto it = unlived_vars_.find(op);
PADDLE_ENFORCE(
it != unlived_vars_.end(),
string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
const std::vector<ir::Node*>& ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
......
......@@ -92,10 +92,11 @@ class ControlFlowGraph {
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
const std::set<std::string>& LiveIn(ir::Node* op) const;
const std::set<std::string>& LiveOut(ir::Node* op) const;
const std::set<std::string>& Use(ir::Node* op) const;
const std::set<std::string>& Unlived(ir::Node* op) const;
const std::vector<ir::Node*>& Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
......@@ -117,6 +118,7 @@ class ControlFlowGraph {
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::unordered_map<ir::Node*, std::set<std::string>> unlived_vars_;
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
......
......@@ -228,9 +228,6 @@ TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
......@@ -256,9 +253,6 @@ TEST(CFGGraph, IRGraph) {
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
......@@ -273,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) {
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
......@@ -324,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) {
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
......@@ -342,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
// cached desc differs from the real one
// mimic an intermediate pass modifying the ProgramDesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
......@@ -376,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
......@@ -392,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
return ret;
};
std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
......@@ -454,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
......@@ -470,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
......
......@@ -118,8 +118,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
}
// fill the pool
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
for (auto& var : cfg_->Unlived(op)) {
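// Unlived(op) is the pre-computed set of variables that die at this op,
// so their memory can be offered to the reuse pool.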
ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
......@@ -127,7 +126,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
}
}
}
graph->ResolveHazard(var_nodes_);
return graph;
......@@ -337,4 +335,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
REGISTER_PASS(memory_optimize_pass,
paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs);
.RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
......@@ -937,9 +937,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
}
void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
if (need_broadcast_var_ ||
(UseGPU() &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
// broadcast received parameters when training in parameter server mode.
if (need_broadcast_var_) {
// There are 4 conditions:
// 1. GPU && Reduce: Reduce gradients, then broadcast them to the other GPUs.
//    Received parameters need to be broadcast to the other GPUs.
// 2. GPU && AllReduce: AllReduce all gradients on each GPU. Received
//    parameters need to be broadcast to the other GPUs.
// 3. CPU && AllReduce: AllReduce all gradients on each thread. Received
//    parameters need to be broadcast to the other scopes.
// 4. CPU && Reduce: all parameters share the same memory, so there is no
//    need to broadcast received parameters.
if (!UseGPU() &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
return;
}
if (strategy_.fuse_broadcast_op_) {
CreateFusedBroadcastOp(result, bcast_var_name_set_);
} else {
......
......@@ -20,8 +20,7 @@ namespace framework {
namespace details {
std::vector<std::unique_ptr<ir::Graph>>
ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
std::unique_ptr<ir::Graph> &&graph) {
ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
std::vector<std::unique_ptr<ir::Graph>> graphs;
graphs.reserve(places_.size());
for (size_t i = 0; i < places_.size(); ++i) {
......@@ -77,24 +76,18 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const framework::ProgramDesc &main_prog, std::unique_ptr<ir::Graph> &&graph)
const std::vector<platform::Place> &places, ir::Graph *graph)
: strategy_(std::move(strategy)),
local_scopes_(std::move(local_scopes)),
pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
places_(std::move(places)),
main_prog_(main_prog),
// TODO(Yancey1989): Copying graphs is not safe since it deletes the
// attrs.
graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
graphs_(SeparateMultiDevicesGraph(graph)) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
auto seq_allreduce_pass =
ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
seq_allreduce_pass->Erase(details::kAllOpDescs);
seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
details::kAllOpDescs,
new std::vector<OpDesc *>(main_prog_.Block(0).AllOps()));
for (size_t i = 0; i < graphs_.size(); ++i) {
graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
}
......@@ -107,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
<< " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
}
}
......
......@@ -31,8 +31,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const framework::ProgramDesc &main_prog,
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
~ParallelSSAGraphExecutor() final = default;
const ir::Graph &Graph() const override { return *graphs_[0]; }
......@@ -41,13 +40,12 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
private:
std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::unique_ptr<::ThreadPool> pool_{nullptr};
std::vector<platform::Place> places_;
framework::ProgramDesc main_prog_;
std::vector<std::unique_ptr<ir::Graph>> graphs_;
std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
......
......@@ -40,7 +40,7 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
static std::unordered_set<std::string> skip_dist_ops{
"send", "recv", "send_barrier", "fetch_barrier"};
auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
auto &ops = graph->Get<const std::vector<OpDesc *>>(kStaleProgramOpDescs);
std::vector<ir::Node *> op_node_list;
op_node_list.reserve(ops.size());
......@@ -107,4 +107,4 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
REGISTER_PASS(sequential_execution_pass,
paddle::framework::details::SequentialExecutionPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
.RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
......@@ -23,9 +23,8 @@ namespace framework {
namespace details {
ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph)
: graph_(std::move(graph)),
const std::vector<platform::Place> &places, ir::Graph *graph)
: graph_(graph),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr),
local_scopes_(local_scopes),
......@@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &run_op_future : run_op_futures_) {
run_op_future.wait();
}
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
exception_holder_.ReThrow();
} else {
continue;
......@@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
PADDLE_ENFORCE(ready_ops.empty());
// Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
return fetch_data;
}
......
......@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
const ir::Graph &Graph() const override { return *graph_; }
// Run a SSAGraph by a thread pool
......@@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op);
private:
std::unique_ptr<ir::Graph> graph_;
ir::Graph *graph_;
std::unique_ptr<::ThreadPool> pool_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
......
......@@ -102,6 +102,8 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
endif ()
......@@ -76,6 +76,9 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
var->inputs.push_back(node);
}
}
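// Record the op order of the original (stale) program so that passes such as
// all_reduce_deps_pass and sequential_execution_pass can read it through
// kStaleProgramOpDescs.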
Set<const std::vector<OpDesc *>>(
details::kStaleProgramOpDescs,
new std::vector<OpDesc *>(program.Block(0).AllOps()));
return var_nodes;
}
......
......@@ -31,7 +31,7 @@ namespace details {
// This attr is not recommended, because the graph should not depend on
// the program once it is built.
constexpr char kAllOpDescs[] = "all_op_descs";
constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs";
} // namespace details
namespace ir {
......@@ -195,6 +195,12 @@ class Graph {
return nullptr;
}
// Returns reference to the original program.
// WARN: After a series of passes, the current graph can be quite
// different from OriginProgram. Caller shouldn't assume much from
// the returned OriginProgram.
const ProgramDesc &OriginProgram() const { return program_; }
// This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
if (type == "conv2d") {
op->SetAttr("use_mkldnn", true);
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
if (inputs.size() > 2)
op->SetInput("Bias", {inputs[2]});
else
op->SetInput("Bias", {});
} else if (type == "elementwise_add") {
op->SetAttr("use_mkldnn", true);
op->SetInput("X", {inputs[0]});
op->SetInput("Y", {inputs[1]});
}
op->SetOutput("Out", outputs);
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kForward));
}
// (c, weights)->conv->f
// (f)->elementwise_add->g
ProgramDesc BuildProgramDesc(bool convWithExistingBias) {
ProgramDesc prog;
std::vector<std::string> nodes{"c", "weights", "f", "eltwise_bias", "g"};
if (convWithExistingBias) nodes.push_back("conv_bias");
for (auto& v : nodes) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::LOD_TENSOR);
if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") {
var->SetPersistable(true);
}
}
// conv+bias, both with MKL-DNN
if (convWithExistingBias) {
SetOp(&prog, "conv2d", "conv",
std::vector<std::string>({"c", "weights", "conv_bias"}),
std::vector<std::string>({"f"}));
} else {
SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"c", "weights"}),
std::vector<std::string>({"f"}));
}
SetOp(&prog, "elementwise_add", "eltwise",
std::vector<std::string>({"f", "eltwise_bias"}),
std::vector<std::string>({"g"}));
return prog;
}
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
const char* var_name) {
auto x = scope->Var(var_name);
auto tensor = x->GetMutable<LoDTensor>();
tensor->mutable_data(place, proto::VarType::FP32,
::paddle::memory::Allocator::kDefault, 1);
}
void MainTest(bool convWithExistingBias) {
auto prog = BuildProgramDesc(convWithExistingBias);
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto place = paddle::platform::CPUPlace();
NaiveExecutor exe{place};
Scope scope;
// Init scope, as it is used in pass
exe.CreateVariables(prog, 0, true, &scope);
if (convWithExistingBias) {
InitTensorHolder(&scope, place, "conv_bias");
InitTensorHolder(&scope, place, "eltwise_bias");
}
graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
// Remove 3 Nodes: Conv, Bias, conv_out
// Add 1 Node: ConvBias
EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
// Assert conv_bias op in newly generated graph
int conv_bias_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == "conv2d") {
auto* op = node->Op();
ASSERT_TRUE(op->HasAttr("use_mkldnn"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
// check if "conv" convolution is fused
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv") {
auto input_names = op->InputNames();
ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") !=
input_names.end());
auto bias = boost::get<std::vector<std::string>>(op->Input("Bias"));
if (bias.size()) {
++conv_bias_count;
}
}
}
}
EXPECT_EQ(conv_bias_count, 1);
}
TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); }
TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); }
TEST(ConvBiasFusePass, conv3d) {
Conv3DBiasFusePass pass;
ASSERT_TRUE(pass.is_conv3d());
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(conv_bias_mkldnn_fuse_pass);
......@@ -44,10 +44,14 @@ struct TestIsReachable {
using func = std::function<bool(const std::string&, const std::string&)>;
auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
auto find_node = [](const std::unique_ptr<ir::Graph>& graph,
auto hash = [](const Node* node) -> std::string {
return node->Name() + std::to_string(node->id());
};
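// Nodes are addressed by name + id so that variables sharing a name map to
// distinct graph nodes.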
auto find_node = [&](const std::unique_ptr<ir::Graph>& graph,
const std::string& name) -> Node* {
for (auto& node : GraphTraits::DFS(*graph)) {
if (name == node.Name()) {
if (name == hash(&node)) {
return &node;
}
}
......@@ -55,13 +59,17 @@ struct TestIsReachable {
return nullptr;
};
return [&](std::string from, const std::string to) -> bool {
// map `from` and `to` to their hashed equivalents while traversing the graph
return [&](std::string from, std::string to) -> bool {
if (from == to) return true;
std::map<std::string, bool> visited;
for (auto& node : GraphTraits::DFS(*graph)) {
visited[node.Name()] = false;
auto hashed = hash(&node);
if (node.Name() == from) from = hashed;
if (node.Name() == to) to = hashed;
visited[hashed] = false;
}
visited[from] = true;
......@@ -72,15 +80,15 @@ struct TestIsReachable {
while (!queue.empty()) {
auto cur = find_node(graph, queue.front());
queue.pop_front();
if (cur == nullptr) return false;
for (auto n : cur->outputs) {
if (n->Name() == to) return true;
auto hashed_name = hash(n);
if (hashed_name == to) return true;
if (!visited[n->Name()]) {
visited[n->Name()] = true;
queue.push_back(n->Name());
if (!visited[hashed_name]) {
visited[hashed_name] = true;
queue.push_back(hashed_name);
}
}
}
......@@ -166,6 +174,28 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
RunPassAndAssert(&prog, "a", "relu", 1);
}
TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionProjectionAsYWithElementwiseAddRelu) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e", "f"},
{"bias", "weights", "bias2", "weights2"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
// right branch
SetOp(&prog, "conv2d",
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"});
// left branch
SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}},
{"Output", "f"});
SetOp(&prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
RunPassAndAssert(&prog, "a", "relu", 2);
}
TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionAsYWithElementwiseAddReluNoBias) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
......
......@@ -21,7 +21,7 @@ namespace ir {
std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Aplies MKL-DNN placement strategy.";
VLOG(3) << "Applies MKL-DNN placement strategy.";
const auto& op_types_list =
Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
for (const Node* n : graph->Nodes()) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
#include <gtest/gtest.h>
#include <boost/logic/tribool.hpp>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, boost::tribool use_mkldnn) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
if (!boost::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn);
if (type == "conv2d") {
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
op->SetInput("Bias", {inputs[2]});
} else if (type == "relu") {
op->SetInput("X", inputs);
} else if (type == "concat") {
op->SetAttr("axis", 1);
op->SetInput("X", {inputs[0], inputs[1]});
} else if (type == "pool2d") {
op->SetInput("X", {inputs[0]});
} else {
FAIL() << "Unexpected operator type.";
}
op->SetOutput("Out", {outputs[0]});
}
// operator use_mkldnn
// ---------------------------------------
// (a,b)->concat->c none
// (c,weights,bias)->conv->f none
// f->relu->g false
// g->pool->h false
// (h,weights2,bias2)->conv->k true
// k->relu->l true
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
"h", "weights2", "bias2", "k", "l"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights" || v == "bias") {
var->SetPersistable(true);
}
}
SetOp(&prog, "concat", "concat1", std::vector<std::string>({"a", "b"}),
std::vector<std::string>({"c"}), boost::indeterminate);
SetOp(&prog, "conv2d", "conv1",
std::vector<std::string>({"c", "weights", "bias"}),
std::vector<std::string>({"f"}), boost::indeterminate);
SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
std::vector<std::string>({"g"}), false);
SetOp(&prog, "pool2d", "pool1", std::vector<std::string>({"g"}),
std::vector<std::string>({"h"}), false);
SetOp(&prog, "conv2d", "conv2",
std::vector<std::string>({"h", "weights2", "bias2"}),
std::vector<std::string>({"k"}), true);
SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
std::vector<std::string>({"l"}), true);
return prog;
}
void MainTest(std::initializer_list<std::string> mkldnn_enabled_op_types,
unsigned expected_use_mkldnn_true_count) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("mkldnn_placement_pass");
pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>(mkldnn_enabled_op_types));
graph = pass->Apply(std::move(graph));
unsigned use_mkldnn_true_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->HasAttr("use_mkldnn") &&
boost::get<bool>(op->GetAttr("use_mkldnn"))) {
++use_mkldnn_true_count;
}
}
}
EXPECT_EQ(use_mkldnn_true_count, expected_use_mkldnn_true_count);
}
TEST(MKLDNNPlacementPass, enable_conv_relu) {
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool
MainTest({"conv2d", "relu"}, 3);
}
TEST(MKLDNNPlacementPass, enable_relu_pool) {
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
MainTest({"relu", "pool2d"}, 4);
}
TEST(MKLDNNPlacementPass, enable_all) {
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
MainTest({}, 4);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(mkldnn_placement_pass);
......@@ -290,7 +290,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
"USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##DEFAULT_TYPE##_ = /* NOLINT */ \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
......
......@@ -904,6 +904,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
this->InferShape(&infer_shape_ctx);
}
std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
const OpKernelType& key) const {
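// Return the cached kernel configs for this kernel type, or nullptr if none
// were registered.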
auto config_iter = kernel_configs_map_.find(key);
std::vector<KernelConfig>* kernel_configs = nullptr;
if (config_iter != kernel_configs_map_.end()) {
kernel_configs = &(config_iter->second);
}
return kernel_configs;
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
RuntimeContext ctx(Inputs(), Outputs(), scope);
......@@ -921,7 +931,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = this->GetExpectedKernelType(
ExecutionContext(*this, scope, *dev_ctx, ctx));
ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
......@@ -940,6 +950,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
KernelTypeToString(expected_kernel_key));
}
std::vector<KernelConfig>* kernel_configs =
GetKernelConfig(expected_kernel_key);
// do data transform
std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope =
......@@ -957,7 +970,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
this->InferShape(&infer_shape_ctx);
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
kernel_iter->second(
ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));
if (!transfered_inplace_vars.empty()) {
// there is inplace variable has been transfered.
......
......@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -184,12 +185,30 @@ class OperatorBase {
const platform::Place& place) const = 0;
};
#ifdef PADDLE_WITH_CUDA
using KernelConfig = boost::variant<
std::shared_ptr<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>,
std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>,
std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>>;
#else
using KernelConfig = boost::variant<boost::blank>;
#endif
using OpKernelConfigsMap =
std::unordered_map<OpKernelType, std::vector<KernelConfig>,
OpKernelType::Hash>;
class ExecutionContext {
public:
ExecutionContext(const OperatorBase& op, const Scope& scope,
const platform::DeviceContext& device_context,
const RuntimeContext& ctx)
: op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
const RuntimeContext& ctx,
std::vector<KernelConfig>* configs)
: op_(op),
scope_(scope),
device_context_(device_context),
ctx_(ctx),
kernel_configs_(configs) {}
const OperatorBase& op() const { return op_; }
......@@ -398,11 +417,20 @@ class ExecutionContext {
return temp_tensor;
}
template <typename T>
T& GetKernelConfig(int idx) const {
PADDLE_ENFORCE(kernel_configs_ && kernel_configs_->size() > idx,
"%s's selected kernel has only %lu kernel configs; "
"config index %d is out of range",
op_.Type().c_str(), kernel_configs_->size(), idx);
return *boost::get<std::shared_ptr<T>>(kernel_configs_->at(idx));
}
private:
const OperatorBase& op_;
const Scope& scope_;
const platform::DeviceContext& device_context_;
const RuntimeContext& ctx_;
mutable std::vector<KernelConfig>* kernel_configs_;
};
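// Usage sketch (illustrative only, mirroring the conv kernels later in this
// change): an operator registers its configs while computing the kernel type,
// e.g.
//   kernel_configs_map_[type].push_back(
//       std::make_shared<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>());
// and the kernel fetches them by index inside Compute():
//   auto& algo_cache =
//       ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
// The index must match the order in which the configs were pushed.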
template <>
......@@ -483,6 +511,8 @@ class OperatorWithKernel : public OperatorBase {
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
std::vector<KernelConfig>* GetKernelConfig(const OpKernelType& key) const;
protected:
virtual OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
......@@ -508,6 +538,9 @@ class OperatorWithKernel : public OperatorBase {
void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const;
protected:
mutable OpKernelConfigsMap kernel_configs_map_;
};
extern bool OpSupportGPU(const std::string& op_type);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <unordered_map>
#include <vector>
namespace paddle {
namespace framework {
// Not thread-safe. Should be owned per-kernel.
template <typename TAlgorithm>
class AlgorithmsCache {
public:
AlgorithmsCache() : search_times_(0) { hash_.clear(); }
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations,
int algorithmFlags,  // can be set differently for different data types
std::function<TAlgorithm()> gen_func);
TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func);
private:
std::unordered_map<int64_t, TAlgorithm> hash_;
int search_times_;
};
template <typename TAlgorithm>
TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
int64_t seed = 0;
// Hash all of the inputs; the hash is used to look up a previously
// discovered algorithm, or we fall back to generating a new one.
std::hash<int64_t> hashFn;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for (const auto num : dims1) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
for (const auto num : dims2) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
}
for (const auto num : strides) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 2;
}
for (const auto num : paddings) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 3;
}
for (const auto num : dilations) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 4;
}
seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
(seed << 6) + (seed >> 2) + 5;
if (seed == 0) return gen_func();
if (hash_.find(seed) == hash_.end()) {
TAlgorithm value = gen_func();
hash_[seed] = value;
}
return hash_[seed];
}
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
if (hash_.find(area) != hash_.end()) {
return hash_[area];
}
if (search_times_ < search_times) {
auto algo = gen_func();
hash_[area] = algo;
++search_times_;
return algo;
}
TAlgorithm algo;
int64_t min = static_cast<int64_t>(INT_MAX);
for (const auto& m : hash_) {
if (m.first < min) {
min = m.first;
algo = m.second;
}
}
return algo;
}
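// Example (illustrative only; `SearchBestFwdAlgo` is a hypothetical callback):
// the shape-keyed overload hashes the convolution signature and invokes the
// generator only on a cache miss.
//
//   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> cache;
//   auto algo = cache.GetAlgorithm(x_dims, f_dims, strides, paddings,
//                                  dilations, /*algorithmFlags=*/0,
//                                  [&]() { return SearchBestFwdAlgo(); });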
} // namespace framework
} // namespace paddle
......@@ -184,9 +184,10 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
ParallelExecutor::ParallelExecutor(
const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;
......@@ -216,11 +217,13 @@ ParallelExecutor::ParallelExecutor(
}
}
std::unique_ptr<ir::Graph> temp_owned_graph(graph);
// FIXME(Yancey1989): parallel graph mode gets better performance
// in GPU allreduce distributed training. Need an elegant way to
// choose the execution strategy.
build_strategy.enable_parallel_graph_ =
EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution(
*temp_owned_graph, exec_strategy, build_strategy);
if (build_strategy.enable_parallel_graph_)
VLOG(0) << "The Executor would execute the graph by ParallelGraph "
"Execution which can get better performance,"
......@@ -254,26 +257,32 @@ ParallelExecutor::ParallelExecutor(
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToDevices(bcast_vars);
}
// Startup Program has been run. All local scopes have correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
std::unique_ptr<ir::Graph> graph;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_,
member_->use_cuda_, member_->nccl_ctxs_.get());
temp_owned_graph = build_strategy.Apply(
std::move(temp_owned_graph), member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_, member_->use_cuda_,
member_->nccl_ctxs_.get());
#else
graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_,
member_->use_cuda_);
temp_owned_graph = build_strategy.Apply(
std::move(temp_owned_graph), member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_, member_->use_cuda_);
#endif
auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) {
graph = member_->PrepareGCAndRefCnts(std::move(graph),
static_cast<size_t>(max_memory_size));
graph = member_
->PrepareGCAndRefCnts(std::move(temp_owned_graph),
static_cast<size_t>(max_memory_size))
.release();
} else {
graph = temp_owned_graph.release();
}
// Step 3. Create vars in each scope. Passes may also create new vars.
......@@ -308,8 +317,7 @@ ParallelExecutor::ParallelExecutor(
// TODO(Yancey1989): Remove passing in the main_program when
// allreduce_seq_pass doesn't need it as the attr.
member_->executor_.reset(new details::ParallelSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_, main_program,
std::move(graph)));
exec_strategy, member_->local_scopes_, member_->places_, graph));
#else
PADDLE_THROW(
"Paddle should be compiled with CUDA for ParallelGraph Execution.");
......@@ -317,12 +325,10 @@ ParallelExecutor::ParallelExecutor(
} else {
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph)));
exec_strategy, member_->local_scopes_, member_->places_, graph));
} else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph)));
exec_strategy, member_->local_scopes_, member_->places_, graph));
}
}
......@@ -452,26 +458,35 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
delete member_;
}
bool ParallelExecutor::EnableParallelGraphExecution(
const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy,
const ir::Graph &graph, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const {
if (!FLAGS_enable_parallel_graph) return false;
bool enable_parallel_graph = true;
for (ir::Node *node : graph.Nodes()) {
if (node->IsVar() && node->Var()) {
// TODO(Yancey1989): support sparse update in ParallelGraph mode.
for (auto &var_desc : main_program.Block(0).AllVars()) {
if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) {
if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) {
enable_parallel_graph = false;
break;
}
}
} else if (node->IsOp() && node->Op()) {
// TODO(Yancey1989): support pserver mode
for (auto &op_desc : main_program.Block(0).AllOps()) {
if (op_desc->Type() == "send" || op_desc->Type() == "recv") {
if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") {
enable_parallel_graph = false;
break;
}
}
}
if (!member_->use_all_reduce_ || !member_->use_cuda_)
......@@ -481,13 +496,6 @@ bool ParallelExecutor::EnableParallelGraphExecution(
return enable_parallel_graph;
}
ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
delete member_;
}
} // namespace framework
} // namespace paddle
......
......@@ -46,11 +46,11 @@ class ParallelExecutor {
public:
explicit ParallelExecutor(const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program,
const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy);
const BuildStrategy &build_strategy,
ir::Graph *graph);
~ParallelExecutor();
......@@ -71,7 +71,7 @@ class ParallelExecutor {
private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
bool EnableParallelGraphExecution(const ProgramDesc &main_program,
bool EnableParallelGraphExecution(const ir::Graph &graph,
const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const;
......
......@@ -27,6 +27,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_utils.h"
#endif
namespace paddle {
namespace framework {
......@@ -37,10 +41,34 @@ class Tensor {
#ifdef PADDLE_WITH_MKLDNN
public:
inline mkldnn::memory::format format() const { return format_; }
// TODO(jczaja): This is deprecated and will be removed
inline mkldnn::memory::format format() const {
if (layout_ == DataLayout::kMKLDNN) {
return static_cast<mkldnn::memory::format>(mem_pd_.desc().data.format);
} else {
return mkldnn::memory::format::format_undef;
}
}
inline void set_format(const mkldnn::memory::format format) {
format_ = format;
// TODO(jczaja): This is deprecated and will be removed
inline void set_format(
const mkldnn::memory::format fmt,
mkldnn::memory::data_type data_type = mkldnn::memory::f32) {
mem_pd_ = paddle::platform::create_prim_desc_from_format(
paddle::framework::vectorize2int(dims()), fmt, data_type);
layout_ = DataLayout::kMKLDNN;
}
inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const {
return mem_pd_;
}
inline void set_mkldnn_prim_desc(
const mkldnn::memory::primitive_desc& mem_pd) {
// Internally MKL-DNN just copies (increases the reference counter of) the
// underlying shared_ptr, so assignment should be quite cheap.
mem_pd_ = mem_pd;
layout_ = DataLayout::kMKLDNN;
}
protected:
......@@ -48,12 +76,9 @@ class Tensor {
* @brief the detailed format of the memory block whose layout is kMKLDNN
*
* @note MKLDNN supports various memory formats like nchw, nhwc, nChw8C,
*       nChw16c, etc. For an MKLDNN memory block, layout will be set as
*       DataLayout::kMKLDNN while the detailed memory format will be kept in
*       this field.
*       nChw16c, etc. For an MKLDNN memory block, we store its memory
*       primitive descriptor.
*/
mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
mutable mkldnn::memory::primitive_desc mem_pd_;
#endif
public:
......
......@@ -50,8 +50,6 @@ class Scope;
} // namespace framework
namespace operators {
template <typename T>
class AlgorithmsCache;
class CudnnRNNCache;
......@@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#ifndef _WIN32
ncclUniqueId, platform::Communicator,
#endif
operators::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>,
operators::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>,
operators::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>,
operators::CudnnRNNCache,
#endif
int, float>;
......
......@@ -249,7 +249,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
framework::Scope scope;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
p.op.RuntimeInferShape(scope, place_, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
p.func(
framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr));
}
}
......
......@@ -44,8 +44,13 @@ class PreparedOp {
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx)
: op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {}
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs)
: op(op),
ctx(ctx),
func(func),
dev_ctx(dev_ctx),
kernel_configs(kernel_configs) {}
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
......@@ -64,8 +69,9 @@ class PreparedOp {
framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = op.GetExpectedKernelType(
framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx));
auto expected_kernel_key =
op.GetExpectedKernelType(framework::ExecutionContext(
op, framework::Scope(), *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
......@@ -83,7 +89,9 @@ class PreparedOp {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
std::vector<framework::KernelConfig>* kernel_configs =
op.GetKernelConfig(expected_kernel_key);
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
}
inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
......@@ -92,6 +100,7 @@ class PreparedOp {
const framework::RuntimeContext& ctx;
framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx;
std::vector<framework::KernelConfig>* kernel_configs;
};
class OpBase;
......@@ -105,23 +114,23 @@ class VarBase {
public:
VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
// Owns `var` and `grad`
explicit VarBase(bool stop_gradient)
: VarBase(new framework::Variable(),
stop_gradient ? nullptr : new VarBase(true), stop_gradient) {}
VarBase(framework::Variable* var, VarBase* grad)
: VarBase(var, grad, false) {}
private:
VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient)
: var_desc_(nullptr),
var_(var),
grads_(grad),
stop_gradient_(false),
pre_op_(nullptr),
pre_op_out_idx_(-1) {}
explicit VarBase(bool stop_gradient)
: var_desc_(nullptr),
var_(new framework::Variable()),
grads_(stop_gradient ? nullptr : new VarBase(true)),
stop_gradient_(stop_gradient),
pre_op_(nullptr),
pre_op_out_idx_(-1) {}
public:
virtual ~VarBase() {
if (var_) {
delete var_;
......@@ -132,11 +141,13 @@ class VarBase {
}
}
OpBase* PreOp() const { return pre_op_; }
int PreOpOutIdx() const { return pre_op_out_idx_; }
inline OpBase* PreOp() const { return pre_op_; }
inline int PreOpOutIdx() const { return pre_op_out_idx_; }
void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; }
bool IsStopGradient() const { return stop_gradient_; }
inline void SetStopGradient(bool stop_gradient) {
stop_gradient_ = stop_gradient;
}
inline bool IsStopGradient() const { return stop_gradient_; }
void RunBackward();
......
......@@ -14,6 +14,8 @@
#include "paddle/fluid/imperative/tracer.h"
#include <set>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -66,8 +68,9 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
return result;
}
void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs, framework::BlockDesc* block,
std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs,
framework::BlockDesc* block,
const platform::Place expected_place,
const bool stop_gradient) {
std::map<std::string, VarBase*> vars;
......@@ -76,6 +79,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
VLOG(3) << "tracer tracing " << op_desc->Type();
op_desc->InferShape(*block);
op_desc->InferVarType(block);
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc);
......@@ -92,7 +96,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
invars.emplace_back(inp->var_);
vars[inp->var_desc_->Name()] = inp;
if (inp->PreOp()) {
if (inp->PreOp() && !inp->IsStopGradient()) {
op->pre_ops_[it.first].push_back(inp->PreOp());
op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx());
} else {
......@@ -138,8 +142,11 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
op->place_ = GetExpectedPlace(expected_place, inputs);
PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
prepared_op.func(framework::ExecutionContext(
prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx));
prepared_op.func(
framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx,
prepared_op.ctx, prepared_op.kernel_configs));
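// Names of the forward inputs/outputs that the generated grad op will read;
// they are returned to the caller, presumably so those variables can be kept
// alive until the backward pass runs.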
std::set<std::string> vars_saved_for_backward;
if (!stop_gradient) {
std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
......@@ -160,6 +167,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
PADDLE_ENFORCE(fwd_var_it != vars.end());
// Forward inputs or outputs.
grad_in_vars.push_back(fwd_var_it->second->var_);
vars_saved_for_backward.insert(it.first);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
......@@ -193,6 +201,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
}
op->block_ = block;
return vars_saved_for_backward;
}
std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
......@@ -202,7 +211,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
op->input_vars_[PyLayer::kFwdInp] = inputs;
op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs);
for (VarBase* inp : inputs) {
if (inp->PreOp()) {
if (inp->PreOp() && !inp->IsStopGradient()) {
op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp());
op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx());
} else {
......
......@@ -15,6 +15,7 @@
#pragma once
#include <map>
#include <set>
#include <string>
#include <vector>
......@@ -43,8 +44,9 @@ class Tracer {
virtual ~Tracer() {}
void Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs, framework::BlockDesc* block,
std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs,
framework::BlockDesc* block,
const platform::Place expected_place,
const bool stop_gradient = false);
......
......@@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search)
if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
......@@ -97,3 +97,4 @@ if (WITH_PYTHON)
endif()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
add_subdirectory(benchmark)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
static framework::proto::VarType::Type kDefaultDtype =
framework::proto::VarType::Type::VarType_Type_BOOL;
template <typename DeviceContext, typename T>
class AllocContinuousSpaceKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &in_var_names = context.Inputs("Input");
auto &out_var_names = context.Outputs("Output");
auto &in_vars = context.MultiInputVar("Input");
auto out_vars = context.MultiOutputVar("Output");
PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0));
PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size());
for (size_t i = 0; i < in_var_names.size(); ++i) {
// Only support LoDTensor
PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr.",
in_var_names[i]);
PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr.",
out_var_names[i]);
PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensor>());
PADDLE_ENFORCE(out_vars[i]->IsType<framework::LoDTensor>());
}
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]);
}
} else {
// Init the output as input
for (size_t i = 0; i < in_tensors.size(); ++i) {
out_vars[i]->GetMutable<framework::LoDTensor>()->Resize(
in_tensors[i]->dims());
}
}
auto &dev_ctx = context.template device_context<DeviceContext>();
// Get numel and dtype
size_t numel = 0;
auto dtype = kDefaultDtype;
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype);
// Alloc the continuous space
auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
fused_tensor->Resize(framework::make_ddim({static_cast<int64_t>(numel)}))
.mutable_data(context.GetPlace(), dtype);
// Init the continuous space
auto out_tensors = context.MultiOutput<framework::LoDTensor>("Output");
int64_t offset = 0;
if (context.Attr<bool>("copy_data")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
int64_t len = out_tensors[i]->numel();
auto sub_tensor = fused_tensor->Slice(offset, offset + len);
offset += len;
framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);
}
} else if (context.Attr<bool>("set_constant")) {
math::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx, fused_tensor,
static_cast<T>(context.Attr<float>("constant")));
}
// Make the outputs point to the continuous space.
offset = 0;
for (size_t i = 0; i < out_tensors.size(); ++i) {
int64_t len = out_tensors[i]->numel();
auto dim = out_tensors[i]->dims();
out_tensors[i]
->ShareDataWith(fused_tensor->Slice(offset, offset + len))
.Resize(dim);
offset += len;
VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i]
<< ") ,dim:(" << dim << ")"
<< " Address: " << out_tensors[i]->data<void>();
}
}
void GetMemSizeAndDtype(
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
framework::proto::VarType::Type *dtype) const {
PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
*numel = 0;
for (size_t i = 0; i < var_names.size(); ++i) {
PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.",
var_names[i]);
auto p_dtype = lod_tensors[i]->type();
if (*dtype == kDefaultDtype) {
PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.",
var_names[i], kDefaultDtype);
*dtype = p_dtype;
}
PADDLE_ENFORCE_EQ(p_dtype, *dtype, "The dtype of input vars is not consistent.");
auto size = lod_tensors[i]->numel();
PADDLE_ENFORCE_GT(size, 0);
VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:("
<< lod_tensors[i]->dims() << ")";
*numel += size;
}
}
};
class AllocContinuousSpaceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {}
};
class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input",
"(vector<LoDTensor>) The input tensors of"
" alloc_continuous_space operator.")
.AsDuplicable();
AddOutput("Output",
"(vector<LoDTensor>) The output "
"tensors of alloc_continuous_space operator. And the address "
"of output tensors are continuous, they are sliced from the "
"tensor of FusedOutput.")
.AsDuplicable();
AddOutput("FusedOutput",
"(LoDTensor) The output tensor "
"of alloc_continuous_space operator. And the tensors of"
" Output is sliced from the tensor of FusedOutput.");
AddAttr<bool>("copy_data", "Whether to copy the Input value to Output.")
.SetDefault(false);
AddAttr<bool>("set_constant",
"Whether to set the Output with a constant value.")
.SetDefault(false);
AddAttr<float>("constant",
"If set_constant is true, the constant value will be used "
"to set the Output.")
.SetDefault(0.0);
AddAttr<bool>("check_name",
"Whether to check the name of Input and Output to ensure "
"they are the same separately.")
.SetDefault(false);
AddComment(R"DOC(
AllocContinuousSpace Operator.

alloc_continuous_space is used to make the addresses of the Output tensors
contiguous according to the Input. This Op allocates one big tensor whose
dtype is the same as that of the input tensors, whose size is the sum of the
input tensors' numel, and whose dim is {sum(numel)}. The big tensor is stored
in FusedOutput, and the tensors of Output are sliced from it.
Note that the dtype of every Input tensor should be the same, and the dim of
each Input tensor should equal the dim of its corresponding Output tensor.
The tensors of Input and Output may be the same or different, and
alloc_continuous_space allows either copying the values of Input to Output or
setting Output to a constant value.
}
};
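// Illustrative example (hypothetical shapes): with Input = {X[2x3], Y[4]} the
// op allocates FusedOutput with numel 2*3 + 4 = 10 and dim {10}; Output[0] and
// Output[1] then share FusedOutput's memory at offsets 0 and 6, resized back
// to {2, 3} and {4} respectively.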
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(alloc_continuous_space,
paddle::operators::AllocContinuousSpaceOp,
paddle::operators::AllocContinuousSpaceOpMaker);
namespace ops = paddle::operators;
REGISTER_OP_CPU_KERNEL(
alloc_continuous_space,
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, int>,
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, float>,
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
double>);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL(
alloc_continuous_space,
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, int>,
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, float>,
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
double>);
#endif
......@@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
auto& dev_ctx = *pool.Get(dev_place);
framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);
framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx, nullptr);
const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
......
......@@ -122,7 +122,7 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
auto cpu_place = std::unique_ptr<paddle::platform::CPUPlace>(
new paddle::platform::CPUPlace());
paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place.get());
paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
framework::LoD lod;
lod.push_back(source_level_lod);
......
cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
DEPS memory timer framework_proto proto_desc lod_tensor op_registry
device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester.h"
#include <fstream>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace operators {
namespace benchmark {
DEFINE_string(op_config_list, "", "Path of op config file.");
DEFINE_int32(specified_config_id, -1, "Test the specified op config.");
void OpTester::Init(const std::string &filename) {
Init(OpTesterConfig(filename));
}
void OpTester::Init(const OpTesterConfig &config) {
config_ = config;
auto &op_desc_info = framework::OpInfoMap::Instance();
// Initialize the OpDesc
if (op_desc_info.Has(config_.op_type)) {
type_ = config_.op_type;
op_desc_.SetType(config_.op_type);
CreateInputVarDesc();
CreateOutputVarDesc();
} else {
LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered.";
}
if (config_.device_id >= 0) {
place_ = paddle::platform::CUDAPlace(config_.device_id);
} else {
place_ = paddle::platform::CPUPlace();
}
framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
op_ = framework::OpRegistry::CreateOp(op_desc_);
CreateVariables(scope_.get());
}
void OpTester::Run() {
if (config_.print_debug_string) {
LOG(INFO) << DebugString();
}
// Warm up
RunImpl();
platform::Timer timer;
if (config_.profile) {
if (platform::is_cpu_place(place_)) {
platform::EnableProfiler(platform::ProfilerState::kCPU);
} else {
#ifdef PADDLE_WITH_CUDA
platform::EnableProfiler(platform::ProfilerState::kAll);
platform::SetDeviceId(config_.device_id);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
timer.Start();
for (int i = config_.repeat; i > 0; --i) {
RunImpl();
}
timer.Pause();
platform::DisableProfiler(platform::EventSortingKey::kDefault,
"op_tester_profiler");
} else {
timer.Start();
for (int i = config_.repeat; i > 0; --i) {
RunImpl();
}
timer.Pause();
}
config_.runtime = timer.ElapsedMS() / config_.repeat;
LOG(INFO) << "=== Run " << config_.repeat
<< " times, latency: " << config_.runtime << " ms ===";
}
void OpTester::RunImpl() {
op_->Run(*scope_, place_);
platform::DeviceContextPool::Instance().Get(place_)->Wait();
scope_->DropKids();
}
std::vector<std::string> OpTester::GetOpProtoInputNames() {
std::vector<std::string> input_names;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
for (int i = 0; i != proto.inputs_size(); ++i) {
const auto &input = proto.inputs(i);
input_names.push_back(input.name());
}
return input_names;
}
std::vector<std::string> OpTester::GetOpProtoOutputNames() {
std::vector<std::string> output_names;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
for (int i = 0; i != proto.outputs_size(); ++i) {
const auto &output = proto.outputs(i);
output_names.push_back(output.name());
}
return output_names;
}
void OpTester::CreateInputVarDesc() {
std::vector<std::string> input_names = GetOpProtoInputNames();
for (auto &name : input_names) {
const OpInputConfig *input = config_.GetInput(name);
if (input == nullptr) {
LOG(FATAL) << "The input " << name << " of op " << config_.op_type
<< " is not correctlly provided.";
}
std::string var_name = config_.op_type + "." + name;
framework::VarDesc *var = Var(var_name);
// Need to support more types
var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32);
var->SetShape(input->dims);
op_desc_.SetInput(name, {var_name});
input_lods_[var_name] = input->lod;
}
}
void OpTester::CreateOutputVarDesc() {
std::vector<std::string> output_names = GetOpProtoOutputNames();
for (auto &name : output_names) {
std::string var_name = config_.op_type + "." + name;
framework::VarDesc *var = Var(var_name);
// Need to support more types
var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32);
op_desc_.SetOutput(name, {var_name});
}
}
framework::VarDesc *OpTester::Var(const std::string &name) {
auto it = vars_.find(name);
if (it != vars_.end()) {
return it->second.get();
}
auto *var = new framework::VarDesc(name);
vars_[name].reset(var);
return var;
}
template <typename T>
void OpTester::SetupTensor(framework::LoDTensor *tensor,
const std::vector<int64_t> &shape, T lower,
T upper) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
if (platform::is_cpu_place(place_)) {
for (int i = 0; i < tensor->numel(); ++i) {
ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
} else {
framework::LoDTensor cpu_tensor;
T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
platform::CPUPlace());
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
TensorCopySync(cpu_tensor, place_, tensor);
}
}
void OpTester::CreateVariables(framework::Scope *scope) {
for (auto &item : vars_) {
auto &var = item.second;
if (var->Name() == framework::kEmptyVarName) {
continue;
}
auto *ptr = scope->Var(var->Name());
framework::InitializeVariable(ptr, var->GetType());
if (var->Persistable()) {
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
for (auto &item : input_lods_) {
// Allocate memory for input tensor
auto &var_name = item.first;
VLOG(3) << "Allocate memory for tensor " << var_name;
auto &var_desc = vars_[var_name];
std::vector<int64_t> shape = var_desc->GetShape();
auto *var = scope->Var(var_name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0));
VLOG(3) << "Set lod for tensor " << var_name;
std::vector<std::vector<size_t>> &lod_vec = item.second;
framework::LoD lod;
for (size_t i = 0; i < lod_vec.size(); ++i) {
lod.push_back(lod_vec[i]);
}
tensor->set_lod(lod);
}
}
static std::string GenSpaces(int count) {
std::stringstream ss;
for (int i = 0; i < count; ++i) {
ss << " ";
}
return ss.str();
}
std::string OpTester::DebugString() {
std::stringstream ss;
int count = 0;
for (auto &item : vars_) {
auto &var = item.second;
ss << GenSpaces(count++) << "vars {\n";
ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n";
ss << GenSpaces(count++) << "type: {\n";
ss << GenSpaces(count) << "type: LOD_TENSOR\n";
ss << GenSpaces(count++) << "lod_tensor {\n";
ss << GenSpaces(count++) << "tensor {\n";
ss << GenSpaces(count) << "data_type: FP32\n";
std::vector<int64_t> shape = var->GetShape();
for (auto d : shape) {
ss << GenSpaces(count) << "dims: " << d << "\n";
}
ss << GenSpaces(--count) << "}\n";
ss << GenSpaces(--count) << "}\n";
ss << GenSpaces(--count) << "}\n";
ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n";
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(count++) << "ops {\n";
for (auto &name : op_desc_.InputNames()) {
ss << GenSpaces(count++) << "inputs {\n";
ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0]
<< "\"\n";
ss << GenSpaces(--count) << "}\n";
}
for (auto &name : op_desc_.OutputNames()) {
ss << GenSpaces(count++) << "outputs {\n";
ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0]
<< "\"\n";
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
ss << GenSpaces(--count) << "}\n";
return ss.str();
}
TEST(op_tester, base) {
if (!FLAGS_op_config_list.empty()) {
std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
FLAGS_op_config_list.c_str());
std::vector<OpTesterConfig> op_configs;
while (!fin.eof()) {
OpTesterConfig config;
bool result = config.Init(fin);
if (result) {
op_configs.push_back(config);
}
}
if (FLAGS_specified_config_id >= 0 &&
FLAGS_specified_config_id < static_cast<int>(op_configs.size())) {
OpTester tester;
tester.Init(op_configs[FLAGS_specified_config_id]);
tester.Run();
} else {
for (size_t i = 0; i < op_configs.size(); ++i) {
OpTester tester;
tester.Init(op_configs[i]);
tester.Run();
}
}
} else {
OpTester tester;
OpTesterConfig config;
config.op_type = "elementwise_add";
config.inputs.resize(2);
config.inputs[0].name = "X";
config.inputs[0].dims = {64, 64};
config.inputs[1].name = "Y";
config.inputs[1].dims = {64, 1};
tester.Init(config);
tester.Run();
}
}
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
namespace paddle {
namespace operators {
namespace benchmark {
class OpTester {
public:
OpTester() {}
void Init(const std::string &filename);
void Init(const OpTesterConfig &config);
void Run();
std::string DebugString();
private:
std::vector<std::string> GetOpProtoInputNames();
std::vector<std::string> GetOpProtoOutputNames();
void CreateInputVarDesc();
void CreateOutputVarDesc();
framework::VarDesc *Var(const std::string &name);
void CreateVariables(framework::Scope *scope);
template <typename T>
void SetupTensor(framework::LoDTensor *input,
const std::vector<int64_t> &shape, T lower, T upper);
void RunImpl();
private:
OpTesterConfig config_;
std::string type_;
framework::OpDesc op_desc_;
std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
std::unique_ptr<framework::OperatorBase> op_;
platform::Place place_;
std::unique_ptr<framework::Scope> scope_;
};
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
#include <fstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace benchmark {
static const char kStartSeparator[] = "{";
static const char kEndSeparator[] = "}";
static const char kSepBetweenItems[] = ";";
static bool StartWith(const std::string& str, const std::string& substr) {
return str.find(substr) == 0;
}
static bool EndWith(const std::string& str, const std::string& substr) {
return str.rfind(substr) == (str.length() - substr.length());
}
static void EraseEndSep(std::string* str,
std::string substr = kSepBetweenItems) {
if (EndWith(*str, substr)) {
str->erase(str->length() - substr.length(), str->length());
}
}
void OpInputConfig::ParseDims(std::istream& is) {
std::string dims_str;
is >> dims_str;
dims.clear();
std::string token;
std::istringstream token_stream(dims_str);
while (std::getline(token_stream, token, 'x')) {
dims.push_back(std::stoi(token));
}
}
void OpInputConfig::ParseLoD(std::istream& is) {
std::string lod_str;
std::string start_sep =
std::string(kStartSeparator) + std::string(kStartSeparator);
std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator);
std::string sep;
is >> sep;
if (StartWith(sep, start_sep)) {
lod_str += sep;
while (!EndWith(sep, end_sep)) {
is >> sep;
lod_str += sep;
}
}
EraseEndSep(&lod_str);
PADDLE_ENFORCE_GE(lod_str.length(), 4U);
VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
// Parse the lod_str
lod.clear();
for (size_t i = 1; i < lod_str.length() - 1;) {
if (lod_str[i] == '{') {
std::vector<size_t> level;
while (lod_str[i] != '}') {
++i;
std::string number;
while (lod_str[i] >= '0' && lod_str[i] <= '9') {
number += lod_str[i];
++i;
}
level.push_back(atoi(number.c_str()));
}
lod.push_back(level);
} else if (lod_str[i] == '}') {
++i;
}
}
}
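// Illustrative input accepted by ParseLoD (format inferred from the parser
// above, so treat it as an assumption): the whitespace-free token
// "{{0,2,5}{0,1,2,3,4,5}}" yields lod = {{0, 2, 5}, {0, 1, 2, 3, 4, 5}};
// within a level any single non-digit character separates the numbers, and
// levels are written back to back.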
OpInputConfig::OpInputConfig(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "name" || sep == "name:") {
is >> name;
EraseEndSep(&name);
} else if (sep == "dims" || sep == "dims:") {
ParseDims(is);
} else if (sep == "lod" || sep == "lod:") {
ParseLoD(is);
}
}
}
}
OpTesterConfig::OpTesterConfig(const std::string& filename) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
filename.c_str());
Init(fin);
}
bool OpTesterConfig::Init(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "op_type" || sep == "op_type:") {
is >> op_type;
} else if (sep == "device_id" || sep == "device_id:") {
is >> device_id;
} else if (sep == "repeat" || sep == "repeat:") {
is >> repeat;
} else if (sep == "profile" || sep == "profile:") {
is >> profile;
} else if (sep == "print_debug_string" || sep == "print_debug_string:") {
is >> print_debug_string;
} else if (sep == "input" || sep == "input:") {
OpInputConfig input_config(is);
inputs.push_back(input_config);
} else if (sep == "attrs" || sep == "attrs:") {
ParseAttrs(is);
} else {
if (sep != kEndSeparator) {
return false;
}
}
}
} else {
return false;
}
return true;
}
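// Illustrative config accepted by Init() above (values are hypothetical;
// tokens are whitespace-separated, cf. the default test in op_tester.cc):
//   {
//     op_type: elementwise_add
//     device_id: -1
//     repeat: 10
//     input { name: X; dims: 64x64; }
//     input { name: Y; dims: 64x1; }
//   }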
bool OpTesterConfig::ParseAttrs(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (true) {
std::string key;
is >> key;
if (key == kEndSeparator) {
break;
}
std::string value;
is >> value;
EraseEndSep(&key, ":");
EraseEndSep(&value);
attrs[key] = value;
}
}
return true;
}
const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
for (size_t i = 0; i < inputs.size(); ++i) {
if (inputs[i].name == name) {
return &inputs[i];
}
}
return nullptr;
}
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>
namespace paddle {
namespace operators {
namespace benchmark {
struct OpInputConfig {
OpInputConfig() {}
explicit OpInputConfig(std::istream& is);
void ParseDims(std::istream& is);
void ParseLoD(std::istream& is);
std::string name;
std::vector<int64_t> dims;
std::vector<std::vector<size_t>> lod;
};
struct OpTesterConfig {
OpTesterConfig() {}
explicit OpTesterConfig(const std::string& filename);
bool Init(std::istream& is);
bool ParseAttrs(std::istream& is);
const OpInputConfig* GetInput(const std::string& name);
std::string op_type;
std::vector<OpInputConfig> inputs;
std::unordered_map<std::string, std::string> attrs;
int device_id{-1}; // CPU: -1
int repeat{1};
int profile{0};
int print_debug_string{0};
double runtime{0.0};
};
} // namespace benchmark
} // namespace operators
} // namespace paddle
......@@ -42,6 +42,7 @@ using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
using framework::AlgorithmsCache;
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
......@@ -169,18 +170,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo;
} else if (exhaustive_search && (!half_float)) {
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
......@@ -188,7 +179,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
algo = algo_cache->GetAlgorithm(
algo = algo_cache.GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
......@@ -382,22 +373,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
data_algo_cache =
ctx.scope()
.FindVar(kCUDNNBwdDataAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
} else {
data_algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNBwdDataAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
}
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>& data_algo_cache =
ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>(
0);
data_algo = data_algo_cache->GetAlgorithm(
data_algo = data_algo_cache.GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionBwdDataAlgoPerf_t,
......@@ -448,22 +428,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
f_algo_cache =
ctx.scope()
.FindVar(kCUDNNBwdFilterAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
} else {
f_algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNBwdFilterAlgoCache)
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
}
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>& f_algo_cache =
ctx.GetKernelConfig<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>(1);
filter_algo = f_algo_cache->GetAlgorithm(
filter_algo = f_algo_cache.GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <functional>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
......@@ -46,100 +47,5 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
#endif
template <typename TAlgorithm>
class AlgorithmsCache {
public:
AlgorithmsCache() : search_times_(0) { hash_.clear(); }
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations,
int algorithmFlags,  // can be set differently for different data types
std::function<TAlgorithm()> gen_func);
TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func);
private:
std::unordered_map<int64_t, TAlgorithm> hash_;
std::mutex mutex_;
int search_times_;
};
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
std::lock_guard<std::mutex> lock(mutex_);
int64_t seed = 0;
// Hash all of the inputs; the hash is used to look up a previously
// discovered algorithm, or we fall back to generating a new one.
std::hash<int64_t> hashFn;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for (const auto num : dims1) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
for (const auto num : dims2) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
}
for (const auto num : strides) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 2;
}
for (const auto num : paddings) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 3;
}
for (const auto num : dilations) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 4;
}
seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
(seed << 6) + (seed >> 2) + 5;
if (seed == 0) return gen_func();
if (hash_.find(seed) == hash_.end()) {
TAlgorithm value = gen_func();
hash_[seed] = value;
}
return hash_[seed];
}
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
if (hash_.find(area) != hash_.end()) {
return hash_[area];
}
if (search_times_ < search_times) {
auto algo = gen_func();
hash_[area] = algo;
++search_times_;
return algo;
}
TAlgorithm algo;
int64_t min = static_cast<int64_t>(INT_MAX);
for (const auto& m : hash_) {
if (m.first < min) {
min = m.first;
algo = m.second;
}
}
return algo;
}
} // namespace operators
} // namespace paddle
......@@ -30,6 +30,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
using DataLayout = platform::DataLayout;
using framework::AlgorithmsCache;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
......@@ -139,37 +141,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
}
return fwd_perf_stat[0].algo;
};
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
int search_times = ctx.Attr<int>("search_times");
search_times = std::max(
static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
// TODO(dangqingqing): Unify this if-else.
if (search_times > 0) {
// The searched algo will be cached for up to `search_times` different
// input dimensions. For other dimensions, the algo of the closest area
// is selected.
auto var_name = ctx.Inputs("AlgoCache")[0];
algo_cache =
ctx.scope()
.FindVar(var_name)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
search_func);
} else {
// Cache searched algo in Var(kCUDNNFwdAlgoCache).
// all conv ops use the same kCUDNNFwdAlgoCache variable.
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
// TODO(qingqing) remove const_cast
algo_cache =
const_cast<framework::Scope*>(ctx.scope().parent())
->Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings,
algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings,
dilations, 0, search_func);
}
VLOG(3) << "choose algo " << algo;
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
......@@ -80,6 +81,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
auto input_data_type = ctx.Input<Tensor>("Input")->type();
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout = framework::StringToDataLayout(data_format);
......@@ -93,11 +95,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
customized_type_value = kConvMKLDNNFP32;
customized_type_value =
(input_data_type == framework::DataTypeTrait<int8_t>::DataType ||
input_data_type == framework::DataTypeTrait<uint8_t>::DataType)
? kConvMKLDNNINT8
: kConvMKLDNNFP32;
}
#endif
auto input_data_type = ctx.Input<Tensor>("Input")->type();
if (input_data_type != framework::proto::VarType::INT8 &&
input_data_type != framework::proto::VarType::UINT8) {
auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
......@@ -109,8 +114,20 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
"float16 can only be used when CUDNN is used");
}
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library, customized_type_value);
#ifdef PADDLE_WITH_CUDA
std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
  // TODO(dangqingqing): Currently conv_fusion_op uses cuDNN but sets use_cudnn
  // to false. Once that is fixed, the config should only be created when the
  // library is kCUDNN.
if (configs.empty()) {
std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>> p(
new framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>());
configs.push_back(p);
}
#endif
return type;
}
void Conv2DOpMaker::Make() {
......@@ -410,9 +427,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
}
#endif
return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
auto type = framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
ctx.GetPlace(), layout_, library_,
customized_type_value);
#ifdef PADDLE_WITH_CUDA
if (library_ == framework::LibraryType::kCUDNN) {
std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
if (configs.empty()) {
std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>
p(new framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>());
configs.push_back(p);
std::shared_ptr<
framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>
p2(new framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>());
configs.push_back(p2);
}
}
#endif
return type;
}
class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
......
......@@ -32,14 +32,23 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
int rank = x_dims.size();
PADDLE_ENFORCE_EQ(rank, label_dims.size(),
"Input(X) and Input(Label) shall have the same rank.");
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
framework::product(label_dims) <= 0)) {
check = false;
}
if (check) {
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(label_dims, 0, rank - 1),
"Input(X) and Input(Label) shall have the same shape "
"except the last dimension.");
}
if (ctx->Attrs().Get<bool>("soft_label")) {
if (check) {
PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
"If Attr(soft_label) == true, the last dimension of "
"Input(X) and Input(Label) should be equal.");
}
} else {
PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
"If Attr(softLabel) == false, the last dimension of "
......@@ -82,6 +91,14 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
"Input(Y@Grad) and Input(X) should have the same rank.");
PADDLE_ENFORCE_EQ(label_dims.size(), rank,
"Input(Label) and Input(X) should have the same rank.");
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
framework::product(label_dims) <= 0)) {
check = false;
}
if (check) {
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(label_dims, 0, rank - 1),
"The Input(X) and Input(Label) should have the same "
......@@ -90,12 +107,16 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
framework::slice_ddim(dy_dims, 0, rank - 1),
"The Input(X) and Input(Y@Grad) should have the same "
"shape except the last dimension.");
}
PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
"The last dimension of Input(Y@Grad) should be 1.");
if (ctx->Attrs().Get<bool>("soft_label")) {
PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
if (check) {
PADDLE_ENFORCE_EQ(
x_dims[rank - 1], label_dims[rank - 1],
"When Attr(soft_label) == true, the last dimension of "
"Input(X) and Input(Label) should be equal.");
}
} else {
PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
"When Attr(soft_label) == false, the last dimension of "
......
......@@ -140,9 +140,6 @@ class DataNormOpMaker : public framework::OpProtoAndCheckerMaker {
"Scales of the history data batch, "
"will apply to output when training")
.AsIntermediate();
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Data Normalization.
......
......@@ -172,6 +172,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
framework::make_ddim({1, static_cast<int>(variances.size())}),
ctx.GetPlace());
auto var_et = framework::EigenTensor<T, 2>::From(var_t);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t i = 0; i < variances.size(); ++i) {
var_et(0, i) = variances[i];
}
......@@ -181,8 +185,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
vars->Resize({box_num, static_cast<int>(variances.size())});
auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int i = 0; i < box_num; ++i) {
for (int j = 0; j < variances.size(); ++j) {
e_vars(i, j) = variances[j];
}
}
vars->Resize(var_dim);
}
};  // class PriorBoxOpKernel
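// A hedged, self-contained sketch of the collapse(2) pattern used above; it
// guards on the standard _OPENMP macro rather than Paddle's PADDLE_WITH_MKLML
// build flag, and FillVariances is an illustrative name only.
#include <vector>

inline void FillVariances(std::vector<float>* out,
                          const std::vector<float>& variances, int box_num) {
  out->resize(static_cast<size_t>(box_num) * variances.size());
  const int num_vars = static_cast<int>(variances.size());
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
  for (int i = 0; i < box_num; ++i) {
    for (int j = 0; j < num_vars; ++j) {
      // Both loop nests are collapsed into one parallel iteration space.
      (*out)[static_cast<size_t>(i) * num_vars + j] = variances[j];
    }
  }
}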
......
......@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
"The ignore threshold to ignore confidence loss.")
.SetDefault(0.7);
AddComment(R"DOC(
This operator generate yolov3 loss by given predict result and ground
         This operator generates yolov3 loss based on the given prediction results and ground
truth boxes.
The output of previous network is in shape [N, C, H, W], while H and W
should be the same, specify the grid size, each grid point predict given
number boxes, this given number is specified by anchors, it should be
half anchors length, which following will be represented as S. In the
second dimention(the channel dimention), C should be S * (class_num + 5),
class_num is the box categoriy number of source dataset(such as coco),
so in the second dimention, stores 4 box location coordinates x, y, w, h
and confidence score of the box and class one-hot key of each anchor box.
         should be the same and specify the grid size. Each grid point predicts a
         given number of boxes; this number, denoted below as S, is given by the
         number of anchors. In the second (channel) dimension, C should be equal
         to S * (class_num + 5), where class_num is the number of object
         categories in the source dataset (such as 80 in the COCO dataset). So,
         apart from the 4 box location coordinates x, y, w, h, the channel
         dimension also stores the confidence score of the box and the class
         one-hot key of each anchor box.
While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
correspnd to:
         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`; then the
         box predictions are given as follows:
$$
b_x = \sigma(t_x) + c_x
b_y = \sigma(t_y) + c_y
b_x = \\sigma(t_x) + c_x
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
$$
While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
is specified by anchors.
         In the equations above, :math:`c_x, c_y` is the top-left corner of the
         current grid cell, and :math:`p_w, p_h` are the width and height given by
         the anchors.
As for confidence score, it is the logistic regression value of IoU between
anchor boxes and ground truth boxes, the score of the anchor box which has
the max IoU should be 1, and if the anchor box has IoU bigger then ignore
         the max IoU should be 1, and if an anchor box has an IoU bigger than the ignore
thresh, the confidence score loss of this anchor box will be ignored.
         Therefore, the yolov3 loss consists of three major parts: box location loss,
......@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
In order to trade off box coordinate losses between big boxes and small
    boxes, box coordinate losses will be multiplied by a scale weight, which is
calculated as follow.
calculated as follows.
$$
weight_{box} = 2.0 - t_w * t_h
$$
Final loss will be represented as follow.
Final loss will be represented as follows.
$$
loss = (loss_{xy} + loss_{wh}) * weight_{box}
......
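// A hedged numeric sketch of the box decoding described in the comment above:
// b_x = sigmoid(t_x) + c_x, b_y = sigmoid(t_y) + c_y, b_w = p_w * exp(t_w),
// b_h = p_h * exp(t_h). Box, Sigmoid and DecodeBox are illustrative names,
// not part of the operator itself.
#include <cmath>

struct Box {
  float x, y, w, h;
};

inline float Sigmoid(float v) { return 1.0f / (1.0f + std::exp(-v)); }

inline Box DecodeBox(float tx, float ty, float tw, float th,
                     float cx, float cy,    // top-left corner of the grid cell
                     float pw, float ph) {  // anchor width and height
  return {Sigmoid(tx) + cx, Sigmoid(ty) + cy, pw * std::exp(tw),
          ph * std::exp(th)};
}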
......@@ -77,8 +77,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
} else {
functor.RunMidWise(n, pre, post);
}
z->set_layout(DataLayout::kMKLDNN);
z->set_format(x->format());
z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc());
} else {
PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
x->format() != memory::format::format_undef,
......@@ -116,7 +115,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
// create mkldnn memory for dst
memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
auto dst_mem_pd = sum_pd.dst_primitive_desc();
memory dst_memory = memory(dst_mem_pd, z_data);
std::vector<primitive::at> inputs;
inputs.push_back(srcs[0]);
......@@ -129,9 +129,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
pipeline.push_back(sum_prim);
stream(stream::kind::eager).submit(pipeline).wait();
z->set_layout(DataLayout::kMKLDNN);
z->set_format(
(memory::format)dst_memory.get_primitive_desc().desc().data.format);
z->set_mkldnn_prim_desc(dst_mem_pd);
}
}
};
......@@ -152,24 +150,19 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
auto* out = dout;
auto *x = dout, *y = dout;
auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
in->set_layout(DataLayout::kMKLDNN);
in->set_format(out->format());
};
if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
if (dx->dims() == dy->dims()) {
auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
if (dx) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dx->mutable_data<T>(ctx.GetPlace()));
set_mkldnn_format(dx, dout);
dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc());
}
if (dy) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dy->mutable_data<T>(ctx.GetPlace()));
set_mkldnn_format(dy, dout);
dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc());
}
}
} else {
......
......@@ -31,7 +31,7 @@ template <typename T>
struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, const T* in,
const int num, T* out) {
*out = *(std::max_element(in + 0, in + num, Compare<T>()));
*out = std::abs(*(std::max_element(in + 0, in + num, Compare<T>())));
}
};
......@@ -46,10 +46,8 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
platform::Transform<platform::CPUDeviceContext> trans;
trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
out->mutable_data<T>(ctx.GetPlace()), ClipFunctor<T>(-s, s));
auto in_e = framework::EigenVector<T>::Flatten(in);
auto out_e = framework::EigenVector<T>::Flatten(*out);
out_e.device(*ctx.eigen_device()) = (bin_cnt / s * in_e).round();
out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
}
};
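// A hedged scalar sketch of the functor above, assuming the same abs-max
// scale s found by FindAbsMaxFunctor; like the Eigen expression, it rounds
// the already-clipped value onto bin_cnt quantization levels.
#include <algorithm>
#include <cmath>

inline float ClipAndFakeQuant(float x, float scale, int bin_cnt) {
  const float clipped = std::min(std::max(x, -scale), scale);
  return std::round(bin_cnt / scale * clipped);
}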
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include <random>
#include <string>
......@@ -259,7 +259,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
const std::vector<T>& x, const std::vector<T>& yref,
const typename jit::SeqPoolTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size() % yref.size(), 0);
EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
int w = yref.size();
std::vector<T> y(w);
const T* x_data = x.data();
......
......@@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel<T> {
lstm_value.output_value = out_t.data<T>();
lstm_value.state_value = cell_t.data<T>();
lstm_value.state_active_value = cell_pre_act_t.data<T>();
T cell_clip = 0.0;
math::LstmUnitFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, frame_size, cur_batch_size, gate_act,
cell_act, cand_act);
device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip,
gate_act, cell_act, cand_act);
lstm_value.prev_state_value = lstm_value.state_value;
}
......@@ -316,9 +317,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_value.output_value = nullptr;
lstm_grad.state_active_grad = nullptr;
int cur_batch_size = bend - bstart;
T cell_clip = 0.0;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
cell_clip, gate_act, cell_act, cand_act);
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
......
......@@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(C0) of LSTMP operator should not be null after "
"Input(H0) provided.");
auto h_dims = ctx->GetInputDim("H0");
auto c_dims = ctx->GetInputDim("C0");
PADDLE_ENFORCE(h_dims == c_dims,
"The dimension of Input(H0) and Input(C0) "
"should be the same.");
ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
}
auto b_dims = ctx->GetInputDim("Bias");
......@@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
"This LoDTensor is obtained in the forward and used in the "
"backward.")
.AsIntermediate();
AddOutput("OrderedP0",
"(Tensor) the projection of the initial hidden state "
"H0. This is a tensor with shape (N x P), where N is the "
"batch size and P is the hidden size.")
.AsIntermediate();
    AddAttr<bool>("use_peepholes",
                  "(bool, default: True) "
                  "whether to enable diagonal/peephole connections.")
......@@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(bool, default: False) "
                  "whether to compute reversed LSTMP.")
.SetDefault(false);
    AddAttr<float>("cell_clip",
                   "(float, default: 0.0) "
                   "Clip threshold for the cell state tensor; clipping is "
                   "applied only when this value is greater than 0.0")
        .SetDefault(0.0);
    AddAttr<float>("proj_clip",
                   "(float, default: 0.0) "
                   "Clip threshold for the projection tensor; clipping is "
                   "applied only when this value is greater than 0.0")
        .SetDefault(0.0);
AddAttr<std::string>(
"gate_activation",
"(string, default: sigmoid)"
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
......@@ -21,17 +22,50 @@ limitations under the License. */
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
using platform::Transform;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
class _ClipFunctor {
public:
explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
HOSTDEVICE T operator()(const T& x) const {
if (x < min_)
return min_;
else if (x > max_)
return max_;
else
return x;
}
private:
T min_;
T max_;
};
template <typename T>
class _ClipGradFunctor {
public:
explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
HOSTDEVICE T operator()(const T& x, const T& y) const {
return (y > min_ && y < max_) ? x : 0;
}
private:
T min_;
T max_;
};
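// A hedged standard-library analogue of how these functors are applied later
// in this file through platform::Transform; std::transform stands in here and
// ClipInPlace is an illustrative name only.
#include <algorithm>
#include <vector>

inline void ClipInPlace(std::vector<float>* data, float clip) {
  // Same semantics as _ClipFunctor(-clip, clip) applied elementwise;
  // _ClipGradFunctor would instead zero the gradient at saturated positions.
  std::transform(data->begin(), data->end(), data->begin(), [clip](float v) {
    return std::min(std::max(v, -clip), clip);
  });
}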
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
......@@ -67,9 +101,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden_t0 = ctx.Input<Tensor>("H0");
auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
auto* cell_t0 = ctx.Input<Tensor>("C0");
auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(ctx.GetPlace());
auto* proj_out = ctx.Output<LoDTensor>("Projection");
......@@ -110,6 +146,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
}
lstmp_value.prev_state_value = nullptr;
Tensor ordered_c0;
Tensor ordered_h0;
framework::Vector<size_t> order(batch_gate->lod()[2]);
......@@ -169,18 +206,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
// Since the batch computing for LSTMP reorders the input sequence
// according to their length. The initialized hidden state also needs
// to reorder.
Tensor ordered_h0;
ordered_proj0->mutable_data<T>(ctx.GetPlace());
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
&ordered_h0, true);
blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast<T>(1.0),
ordered_proj0, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
ActCompute(cell_act, place, proj0_dev, proj0_dev);
}
blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
&gate_t, static_cast<T>(1.0));
}
......@@ -189,8 +217,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
lstmp_value.state_value = cell_t.data<T>();
lstmp_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
cell_act, cand_act);
device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip,
gate_act, cell_act, cand_act);
lstmp_value.prev_state_value = lstmp_value.state_value;
blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
&proj_t, static_cast<T>(0.0));
......@@ -198,6 +226,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto proj_t_dev = EigenMatrix<T>::From(proj_t);
ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
}
if (proj_clip && proj_clip > 0.0) {
T* x_data = proj_t.data<T>();
int64_t numel = proj_t.numel();
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), x_data,
x_data + numel, x_data,
_ClipFunctor<T>(-1.0 * proj_clip, proj_clip));
}
}
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
......@@ -239,6 +275,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto* proj_out = ctx.Input<LoDTensor>("Projection");
auto* cell_out = ctx.Input<LoDTensor>("Cell");
auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
......@@ -253,7 +292,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* h0 = ctx.Input<Tensor>("H0");
auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
......@@ -363,6 +401,17 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
Tensor cur_proj = batch_proj.Slice(bstart, bend);
Tensor proj_g = batch_proj_g.Slice(bstart, bend);
if (proj_clip && proj_clip > 0.0) {
T* dx_data = proj_g.data<T>();
T* x_data = cur_proj.data<T>();
int64_t numel = proj_g.numel();
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), dx_data,
dx_data + numel, x_data, dx_data,
_ClipGradFunctor<T>(-1.0 * proj_clip, proj_clip));
}
if (proj_act != math::detail::ActivationType::kIdentity) {
auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
auto proj_g_dev = EigenMatrix<T>::From(proj_g);
......@@ -412,7 +461,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
cell_clip, gate_act, cell_act, cand_act);
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
......@@ -431,32 +480,15 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
&ordered_h0, true);
if (weight_g) {
blas.MatMul(*ordered_proj0, true, gate_g, false,
static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
weight_g, static_cast<T>(1.0));
}
}
if (h0 && (h0_g || proj_weight_g)) {
ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
Tensor proj0_g;
proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
proj0_g.mutable_data<T>(ctx.GetPlace());
blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
&proj0_g, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
proj0_g_dev);
}
if (h0_g) {
blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
&ordered_h0_g, static_cast<T>(0.0));
}
if (proj_weight_g) {
blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
proj_weight_g, static_cast<T>(1.0));
}
}
}
}
......
......@@ -39,6 +39,7 @@ math_library(cross_entropy)
math_library(cos_sim_functor)
math_library(depthwise_conv DEPS cub)
math_library(im2col)
math_library(sample_prob)
math_library(sampler)
math_library(gru_compute DEPS activation_functions math_function)
......
......@@ -32,7 +32,8 @@ namespace detail {
template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
T r_value_in;
......@@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value_in[i] = r_value_in;
value_ig[i] = r_value_ig;
......@@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
ActivationType active_node,
T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
T r_value_in;
......@@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
&r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
&r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
&r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig;
......@@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
#ifdef __AVX__
......@@ -194,7 +196,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value_in[i] = r_value_in;
value_ig[i] = r_value_ig;
......@@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
ActivationType active_node,
T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
#ifdef __AVX__
......@@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
&r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
&r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
&r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig;
......@@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
T cell_clip, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
avx_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
active_node, active_gate, active_state);
} else {
naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
naive_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
active_node, active_gate, active_state);
}
}
template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
active_gate, active_state);
avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
active_node, active_gate, active_state);
} else {
naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
active_node, active_gate, active_state);
}
}
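// The AVX path above is taken only for float and only when one frame fills an
// exact number of 8-lane __m256 registers; a hedged restatement of the bit
// test (FrameFitsAvxLanes is an illustrative helper, not part of the sources):
inline bool FrameFitsAvxLanes(int frame_size) {
  return (frame_size & 7) == 0;  // equivalent to frame_size % 8 == 0
}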
......
......@@ -31,7 +31,8 @@ namespace detail {
*/
template <class T, class Op, bool is_batch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node,
int batch_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value.gate_value[frame_idx] = r_value_in;
value.gate_value[frame_idx + frame_size] = r_value_ig;
......@@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
template <class T, class Op, bool is_batch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
int batch_size, ActivationType active_node,
int batch_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig,
&r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state,
&r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF,
&r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node,
active_gate, active_state);
&r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip,
active_node, active_gate, active_state);
grad.gate_grad[frame_idx] = r_grad_in;
grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
......@@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, int frame_size, int batch_size,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
T cell_clip, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
dim3 threads;
dim3 grid;
if (batch_size == 1) {
......@@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
if (batch_size == 1) {
KeLstmForward<T, Op,
/* is_batch= */ false><<<grid, threads, 0, stream>>>(
op, value, frame_size, batch_size, active_node, active_gate,
op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
active_state);
} else {
KeLstmForward<T, Op,
/* is_batch= */ true><<<grid, threads, 0, stream>>>(
op, value, frame_size, batch_size, active_node, active_gate,
op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
active_state);
}
}
......@@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
dim3 threads;
......@@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
if (batch_size == 1) {
KeLstmBackward<T, Op,
/* is_batch= */ false><<<grid, threads, 0, stream>>>(
op, value, grad, frame_size, batch_size, active_node, active_gate,
active_state);
op, value, grad, frame_size, batch_size, cell_clip, active_node,
active_gate, active_state);
} else {
KeLstmBackward<T, Op,
/* is_batch= */ true><<<grid, threads, 0, stream>>>(
op, value, grad, frame_size, batch_size, active_node, active_gate,
active_state);
op, value, grad, frame_size, batch_size, cell_clip, active_node,
active_gate, active_state);
}
}
......
......@@ -29,7 +29,7 @@ class lstm {
public:
HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
T *prev_state, T *state, T *state_atv, T *output,
T *checkI, T *checkF, T *checkO,
T *checkI, T *checkF, T *checkO, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
......@@ -37,6 +37,15 @@ class lstm {
*value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
*value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
*state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
if (*cell_clip > 0.0) {
if (*state < -1.0 * (*cell_clip)) {
*state = -1.0 * (*cell_clip);
}
if (*state > *cell_clip) {
*state = *cell_clip;
}
}
*value_og = activation(*value_og + (*state) * (*checkO), active_gate);
*state_atv = activation(*state, active_state);
*output = (*value_og) * (*state_atv);
......@@ -52,7 +61,7 @@ class lstm {
__m256 *value_fg, __m256 *value_og,
__m256 *prev_state, __m256 *state,
__m256 *state_atv, __m256 *output, __m256 *checkI,
__m256 *checkF, __m256 *checkO,
__m256 *checkF, __m256 *checkO, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
......@@ -65,6 +74,13 @@ class lstm {
active_gate);
*state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
_mm256_mul_ps(*prev_state, *value_fg));
if (*cell_clip > 0.0f) {
__m256 min = _mm256_set1_ps(0.0f - *cell_clip);
__m256 max = _mm256_set1_ps(*cell_clip);
*state = _mm256_min_ps(max, *state);
*state = _mm256_max_ps(min, *state);
}
*value_og = activation(
_mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
*state_atv = activation(*state, active_state);
......@@ -86,15 +102,26 @@ class lstm {
T *prev_state, T *prev_state_grad, T *state,
T *state_grad, T *state_atv, T *output_grad,
T *checkI, T *checkF, T *checkO, T *checkIGrad,
T *checkFGrad, T *checkOGrad,
T *checkFGrad, T *checkOGrad, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
*grad_og =
activation((*output_grad) * (*state_atv), *value_og, active_gate);
if (*cell_clip > 0.0f) {
if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
*state_grad = 0.0f;
} else {
*state_grad +=
activation((*output_grad) * (*value_og), *state_atv, active_state) +
(*grad_og) * (*checkO);
}
} else {
*state_grad +=
activation((*output_grad) * (*value_og), *state_atv, active_state) +
(*grad_og) * (*checkO);
}
*grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
*grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
*grad_fg =
......@@ -117,15 +144,24 @@ class lstm {
__m256 *prev_state, __m256 *prev_state_grad, __m256 *state,
__m256 *state_grad, __m256 *state_atv, __m256 *output_grad,
__m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad,
__m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
__m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
*grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
active_gate);
if (*cell_clip > 0.0f) {
T *state_ = reinterpret_cast<T *>(state);
if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
*state_grad = _mm256_set1_ps(0.0f);
} else {
*state_grad =
_mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
*state_atv, active_state),
*state_grad);
*state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
*state_grad =
_mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
}
}
*grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
active_node);
*grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
......
......@@ -24,12 +24,12 @@ template <class T>
struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const detail::ActivationType& gate_act,
T cell_clip, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
cand_act, gate_act, cell_act);
cell_clip, cand_act, gate_act, cell_act);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
value.state_active_value += frame_size;
......@@ -45,13 +45,14 @@ template <class T>
struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
frame_size, cand_act, gate_act, cell_act);
frame_size, cell_clip, cand_act, gate_act,
cell_act);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
......
......@@ -24,12 +24,12 @@ template <class T>
struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const detail::ActivationType& gate_act,
T cell_clip, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
frame_size, batch_size, cand_act, gate_act,
cell_act);
frame_size, batch_size, cell_clip, cand_act,
gate_act, cell_act);
}
};
......@@ -37,13 +37,13 @@ template <class T>
struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
frame_size, batch_size, cand_act, gate_act,
cell_act);
frame_size, batch_size, cell_clip, cand_act,
gate_act, cell_act);
}
};
......
......@@ -50,7 +50,7 @@ template <typename DeviceContext, typename T>
class LstmUnitFunctor {
public:
static void compute(const DeviceContext &context, LstmMetaValue<T> value,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act);
......@@ -61,7 +61,7 @@ class LstmUnitGradFunctor {
public:
static void compute(const DeviceContext &context, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size, int batch_size,
const detail::ActivationType &gate_act,
T cell_clip, const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act);
};
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/sample_prob.h"
namespace paddle {
namespace operators {
namespace math {
template class SampleWithProb<platform::CPUDeviceContext, float>;
template class SampleWithProb<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <thrust/random.h>
#include <thrust/sort.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/sampler.h"
namespace paddle {
namespace operators {
namespace math {
using Tensor = framework::Tensor;
template <typename T>
__device__ T gpu_adjust_prob(const T prob, const int num_samples,
const int num_tries) {
if (num_samples == num_tries) {
return prob * num_samples;
} else {
return -expm1(num_tries * log1p(-prob));
}
}
class GPULogUniformSampler {
public:
__device__ int64_t Sample(float random, const int range,
const float log_range) const;
__device__ float Probability(int64_t value, const float log_range) const;
};
__device__ int64_t GPULogUniformSampler::Sample(float random, const int range,
const float log_range) const {
  // Obtain a log-uniform distribution from a uniform distribution via the
  // inverse transform sampling method.
const int64_t value = static_cast<int64_t>(exp(random * log_range)) - 1;
// Mathematically, value should be <= range_, but might not be due to some
// floating point roundoff, so we mod by range_.
return value % range;
}
__device__ float GPULogUniformSampler::Probability(
int64_t value, const float log_range) const {
// Given f(x) = 1/[(x+1) * log_range_]
// The value's probability is integral of f(x) from value to (value + 1)
return (log((value + 2.0) / (value + 1.0))) / log_range;
}
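// A hedged host-side sketch of the two device functions above, assuming u is
// uniform in [0, 1) and log_range = log(range + 1) as set up below; the names
// are illustrative only.
#include <cmath>
#include <cstdint>

inline int64_t LogUniformSample(float u, int range, float log_range) {
  // Inverse transform sampling; the mod guards against floating point
  // round-off pushing the value up to range.
  const int64_t value = static_cast<int64_t>(std::exp(u * log_range)) - 1;
  return value % range;
}

inline float LogUniformProbability(int64_t value, float log_range) {
  // Integral of 1 / ((x + 1) * log(range + 1)) over [value, value + 1).
  return std::log((value + 2.0f) / (value + 1.0f)) / log_range;
}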
template <typename T>
__global__ void SamplingCondidate(
const size_t n, const int num_tries, const int range, const float log_range,
const int num_true, const std::size_t num_samples,
const int64_t* label_data, int64_t* samples_data, T* probabilities_data) {
const int num_sampled_classes = num_true + num_samples;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int step_size = 0;
GPULogUniformSampler sampler;
for (; idx < n; idx += blockDim.x * gridDim.x) {
int col_idx = idx % num_sampled_classes;
int row_idx = idx / num_sampled_classes;
if (col_idx < num_true) {
samples_data[idx] = label_data[row_idx * num_true + col_idx];
} else {
samples_data[idx] = samples_data[col_idx];
}
probabilities_data[idx] = sampler.Probability(samples_data[idx], log_range);
probabilities_data[idx] =
gpu_adjust_prob(probabilities_data[idx], num_samples, num_tries);
}
}
template <typename T>
int UniqSampler(const Sampler& sampler, const std::size_t num_samples,
int64_t* samples_data) {
  // Sample num_samples unique samples for an example; note that they are not
  // all negative samples.
std::unordered_set<int64_t> tmp_samples;
tmp_samples.clear();
int num_tries = 0;
int j = 0;
while (j < num_samples) {
++num_tries;
auto v = sampler.Sample();
auto insert_ok = tmp_samples.insert(v).second;
if (!insert_ok) {
continue;
}
samples_data[j] = v;
++j;
}
return num_tries;
}
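// A hedged usage sketch of the rejection loop above, with illustrative values;
// the sampler is constructed the same way GPUSampleWithProb does below, and
// SampleFiveUniqueIds is not part of the sources.
inline int SampleFiveUniqueIds(std::vector<int64_t>* samples) {
  samples->resize(5);
  math::LogUniformSampler sampler(/*range=*/100, /*seed=*/0);
  // The return value is >= 5; the surplus counts rejected duplicate draws.
  return UniqSampler<float>(sampler, samples->size(), samples->data());
}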
template <typename T>
void GPUSampleWithProb<T>::operator()(
const platform::CUDADeviceContext& context, const int seed,
const int dict_size, const bool uniq, const std::size_t num_samples,
const Tensor* L, Tensor* S, Tensor* P) {
// UNDERSTAND: dimension issues
const auto lbl_dim = L->dims();
const int batch_size = lbl_dim[0];
const int num_true = lbl_dim[1];
const int num_sampled_classes = num_true + num_samples;
framework::DDim ret_dim{batch_size, num_sampled_classes};
// UNDERSTAND: raw data view
const int64_t* label_data = L->data<int64_t>();
int64_t* samples_data = S->data<int64_t>();
T* probabilities_data = P->data<T>();
int s_size = num_samples;
framework::DDim s_dim{s_size};
Tensor s;
int64_t* s_data = s.mutable_data<int64_t>(s_dim, platform::CPUPlace());
math::LogUniformSampler sampler(dict_size, seed);
int range = dict_size;
float log_range = log(range + 1);
int num_tries = UniqSampler<T>(sampler, num_samples, s_data);
VLOG(1) << "num_tries: " << num_tries;
PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data,
sizeof(int64_t) * num_samples,
cudaMemcpyHostToDevice));
int threads = 512;
const size_t size = batch_size * num_sampled_classes;
int grid = (batch_size * num_sampled_classes + threads - 1) / threads;
SamplingCondidate<T><<<grid, threads, 0, context.stream()>>>(
size, num_tries, range, log_range, num_true, num_samples, label_data,
samples_data, probabilities_data);
}
template class GPUSampleWithProb<float>;
template class GPUSampleWithProb<double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/sampler.h"
namespace paddle {
namespace operators {
namespace math {
using Tensor = framework::Tensor;
/* UNDERSTAND: utility function to adjust the probability for unique sampling;
   returns the probability unchanged if unique sampling is not used */
template <typename T>
static T adjust_prob(const T prob, const int num_samples, const int num_tries) {
if (num_samples == num_tries) {
return prob * num_samples;
} else {
return -expm1(num_tries * log1p(-prob));
}
}
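// When num_samples == num_tries every try produced a new id and the
// probability is simply scaled; otherwise the adjusted value is the chance
// that an id with per-try probability p appears at least once in num_tries
// draws, 1 - (1 - p)^num_tries, written with expm1/log1p for numerical
// stability when p is tiny. A hedged check of that identity (not part of the
// sources):
#include <cassert>
#include <cmath>

inline void CheckAdjustProbIdentity() {
  const double p = 1e-4;
  const int k = 37;
  const double stable = -std::expm1(k * std::log1p(-p));
  const double naive = 1.0 - std::pow(1.0 - p, k);
  assert(std::abs(stable - naive) < 1e-12);
}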
template <typename DeviceContext, typename T>
class SampleWithProb {
public:
void operator()(const DeviceContext& context, const Sampler& sampler,
const std::size_t num_samples, const Tensor* L, Tensor* S,
Tensor* P) {
// UNDERSTAND: dimension issues
const auto lbl_dim = L->dims();
const int batch_size = lbl_dim[0];
const int num_true = lbl_dim[1];
const int num_sampled_classes = num_true + num_samples;
framework::DDim ret_dim{batch_size, num_sampled_classes};
// UNDERSTAND: raw data view
const int64_t* label_data = L->data<int64_t>();
int64_t* samples_data =
S->mutable_data<int64_t>(ret_dim, context.GetPlace());
T* probabilities_data = P->mutable_data<T>(ret_dim, context.GetPlace());
// temp sets for unique sampling
std::unordered_set<int64_t> tmp_samples;
int j = 0; // column index
// add true labels, not that efficient
while (j < num_true) {
for (int i = 0; i < batch_size; ++i) {
auto samples_index = i * num_sampled_classes + j;
auto v = label_data[i * num_true + j];
samples_data[samples_index] = v;
probabilities_data[samples_index] = sampler.Probability(v);
}
++j;
}
    // Sample num_samples unique samples for an example; note that they are not
    // all negative samples.
tmp_samples.clear();
int num_tries = 0;
while (j < num_sampled_classes) {
++num_tries;
auto v = sampler.Sample();
auto insert_ok = tmp_samples.insert(v).second;
if (!insert_ok) {
continue;
}
auto p = sampler.Probability(v);
for (int i = 0; i < batch_size; ++i) {
auto samples_index = i * num_sampled_classes + j;
samples_data[samples_index] = v;
probabilities_data[samples_index] = p;
}
++j;
}
// compute Q(y|x), because of unique sampling, probabilities need to be
// adjusted
for (int k = 0; k < num_sampled_classes; ++k) {
for (int i = 0; i < batch_size; ++i) {
auto samples_index = i * num_sampled_classes + k;
probabilities_data[samples_index] = adjust_prob(
probabilities_data[samples_index], num_samples, num_tries);
}
}
}
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class GPUSampleWithProb {
public:
void operator()(const platform::CUDADeviceContext& context, const int seed,
const int dict_size, const bool uniq,
const std::size_t num_samples, const Tensor* L, Tensor* S,
Tensor* P);
};
#endif
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -96,8 +96,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
std::vector<int> src_tz = framework::vectorize2int(x->dims());
auto src_format =
src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
auto src_format = x->format();
const std::string key = gethash(src_tz, algorithm);
const std::string key_src_data =
......@@ -127,10 +126,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
if (p_fwd == nullptr) {
// create mkldnn memory for input X
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), src_format);
auto src_memory = std::shared_ptr<memory>(
new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data)));
// save src_memory to be referred in backward path
dev_ctx.SetBlob(key_src_mem, src_memory);
......@@ -177,8 +174,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
pipeline.push_back(*p_fwd);
stream(stream::kind::eager).submit(pipeline).wait();
y->set_layout(DataLayout::kMKLDNN);
y->set_format(GetMKLDNNFormat(*dst_memory));
y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc());
}
template <typename T>
......@@ -196,9 +192,6 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());
auto diff_y_format =
diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
const std::string key = gethash(diff_dst_tz, algorithm);
const std::string key_src_data =
key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
......@@ -210,8 +203,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
const std::string key_fwd_pd =
key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
const std::string key_with_layouts =
key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
const std::string key_with_layouts = key + std::to_string(*p_src_layout) +
"-" + std::to_string(diff_y->format());
const std::string key_diff_src_mem =
key_with_layouts + "@eltwise_diff_src_mem";
const std::string key_diff_dst_mem =
......@@ -225,7 +218,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
PADDLE_ENFORCE(src_memory != nullptr,
"Fail to find src_memory in device context");
src_memory->set_data_handle(*p_src_data.get());
src_memory->set_data_handle(*p_src_data);
std::shared_ptr<memory> diff_src_memory;
......@@ -234,10 +227,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
if (p_grad == nullptr) {
// create mkldnn memory for input diff_y
auto diff_dst_md = platform::MKLDNNMemDesc(
diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
auto diff_dst_memory = std::shared_ptr<memory>(
new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)));
dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
// retrieve eltwise primitive desc from device context
......@@ -281,8 +272,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
pipeline.push_back(*p_grad);
stream(stream::kind::eager).submit(pipeline).wait();
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
}
template <typename T, mkldnn::algorithm algorithm>
......
......@@ -206,17 +206,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
// keys for backward pass
const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, global_stats, input_format,
src_tz, epsilon, flags, global_stats, x->format(),
ctx.op().Output("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
auto user_src_md = x->get_mkldnn_prim_desc().desc();
// create primitive descriptor for batch norm forward
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
......@@ -230,8 +227,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine,
key);
auto src_memory =
handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(),
to_void_cast(x_data));
// crate mkldnn memory for weights(scale/shift)
auto scaleshift_memory =
......@@ -265,8 +262,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
variance_memory, false);
}
y->set_layout(DataLayout::kMKLDNN);
y->set_format(platform::GetMKLDNNFormat(*dst_memory));
y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc());
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*batch_norm_p);
......@@ -336,9 +332,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
mkldnn::memory::format dst_format =
platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
mkldnn::memory::format input_format =
platform::MKLDNNFormatForSize(src_tz.size(), x->format());
......@@ -346,14 +339,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// keys from forward pass
const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, false, input_format,
src_tz, epsilon, flags, false, x->format(),
ctx.op().Input("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
// keys for primitives reuse
const std::string key_with_hash =
key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false,
input_format);
x->format());
const std::string key_batch_norm_bwd_p =
key_with_hash + "@batch_norm_bwd_p";
const std::string key_batch_norm_src_mem_p =
......@@ -373,9 +366,8 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
primitive reorder_diff_dst;
bool is_diff_dst_reordered = false;
auto user_diff_dst_memory = memory(
{{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
to_void_cast(diff_y_data));
auto user_diff_dst_memory =
memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data));
// MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic;
......@@ -459,10 +451,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory);
// set layout/format of output tensors
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format);
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
} else {
// primitives already exist
UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data));
......@@ -487,10 +476,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
}
// set layout/format of output tensors
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc()
.desc()
.data.format);
diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc());
}
// execute optional reorder and batch_norm backward primitive
......
......@@ -47,11 +47,6 @@ static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
return mem_prim_desc;
}
static mkldnn::memory::format GetDstMemFormat(
const concat::primitive_desc& concat_pd) {
return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
}
static platform::CPUPlace GetCpuPlace(
const paddle::framework::ExecutionContext& ctx) {
auto place = ctx.GetPlace();
......@@ -139,8 +134,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
stream(stream::kind::eager).submit({concat}).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetDstMemFormat(concat_pd));
output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc());
}
};
} // namespace operators
......
......@@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
auto* output = ctx.Output<Tensor>("Output");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
input->format() != memory::format::format_undef,
"Wrong layout/format set for Input tensor");
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN);
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN);
PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
......@@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<primitive> pipeline;
auto src_format = input->format();
// For convolution with groups we need to recreate the primitive descriptor,
// as the Paddle tensor does not have a group dimension while MKL-DNN treats
// groups as an extra dimension
mkldnn::memory::primitive_desc user_weights_mpd =
filter->get_mkldnn_prim_desc();
if (g > 1) {
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
user_weights_mpd =
mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine);
}
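As a side note on the group handling above (our own illustrative helper; the {O, I, H, W} dimension order is an assumption, not taken from this diff), the filter's logical dims gain a leading group dimension:

```
// Sketch (ours): {O, I, H, W} filter dims become {g, O/g, I/g, H, W} when the
// oihw format is promoted to goihw for grouped convolution.
#include <vector>

std::vector<int> GroupedWeightsDims(const std::vector<int>& weights_tz, int g) {
  return {g, weights_tz[0] / g, weights_tz[1] / g, weights_tz[2], weights_tz[3]};
}
```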
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
......@@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
weights_format = mkldnn::memory::format::any;
mkldnn::memory::format weights_format = mkldnn::memory::format::any;
// Check the format for user's special output
if (chosen_memory_format != mkldnn::memory::format::any) {
if (is_conv3d) {
......@@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p =
handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_src_memory_p = handler.AcquireSrcMemory(
input->get_mkldnn_prim_desc(), to_void_cast<T>(input_data));
auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_md, to_void_cast<T>(filter_data));
user_weights_mpd, to_void_cast<T>(filter_data));
// create reorder primitive if the input format is not the preferred one
auto src_memory_p =
......@@ -281,8 +282,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_p);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc());
}
void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
const bool is_test = ctx.Attr<bool>("is_test");
......@@ -947,8 +947,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_bwd_weights_p);
filter_grad->set_layout(DataLayout::kMKLDNN);
filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc();
filter_grad->set_mkldnn_prim_desc(filter_grad_mpd);
}
if (input_grad) {
......@@ -971,8 +971,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_bwd_data_p);
input_grad->set_layout(DataLayout::kMKLDNN);
input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc());
}
stream(stream::kind::eager).submit(pipeline).wait();
}
......@@ -990,12 +989,12 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, U8,
ops::kConvMKLDNNFP32,
ops::kConvMKLDNNINT8,
ops::ConvMKLDNNOpKernel<uint8_t, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, S8,
ops::kConvMKLDNNFP32,
ops::kConvMKLDNNINT8,
ops::ConvMKLDNNOpKernel<int8_t, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
......
......@@ -221,8 +221,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc());
}
private:
......
......@@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
// The output format is set to MKL-DNN's format
// TODO(@mozga-intel) The format of the matrix is set inside the other layers.
tensor->set_layout(DataLayout::kMKLDNN);
tensor->set_format(mkldnn::memory::format::oihw);
// TODO(jczaja): Remove this hack after checking performance on block layout
auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(tensor->dims()),
mkldnn::memory::format::oihw);
tensor->set_mkldnn_prim_desc(tensor_mem_pd);
}
};
} // namespace operators
......
......@@ -81,10 +81,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
e_mid = e_mid.constant(k);
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, x->format());
auto src_md = x->get_mkldnn_prim_desc().desc();
auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
mkldnn::lrn_across_channels,
......@@ -94,7 +91,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
beta,
k};
auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto src_memory_pd = x->get_mkldnn_prim_desc();
if (!is_test) {
const std::string key = ctx.op().Output("Out");
......@@ -111,16 +108,15 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory->set_data_handle(
static_cast<void*>(const_cast<T*>(input_data)));
auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(),
static_cast<void*>(output_data));
auto dst_memory_pd = forward_pd->dst_primitive_desc();
auto dst_memory =
mkldnn::memory(dst_memory_pd, static_cast<void*>(output_data));
auto workspace_memory = insert_to_context<mkldnn::memory>(
key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc());
run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(platform::GetMKLDNNFormat(dst_memory));
out->set_mkldnn_prim_desc(dst_memory_pd);
} else {
auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
......@@ -128,13 +124,12 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()};
auto dst_memory_pd = forward_pd.dst_primitive_desc();
auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(),
static_cast<void*>(output_data));
run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(platform::GetMKLDNNFormat(dst_memory));
out->set_mkldnn_prim_desc(dst_memory_pd);
}
}
};
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/operators/pool_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
......@@ -29,23 +30,23 @@ using mkldnn::stream;
using platform::to_void_cast;
// Generate keys for storing/retrieving primitives for this operator
// TODO(jczaja): Make hashing function more optimal
static std::string gethash(const memory::dims& input_dims,
std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
const memory::dims& input_dims,
const std::string& pooling_type,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const memory::data_type& dt,
const std::string& suffix) {
auto dims2str = [](const memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
}
return dstr;
};
return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) +
dims2str(paddings) + std::to_string(dt) + pooling_type + suffix;
const memory::data_type& dt, const std::string& suffix) {
std::string key;
key.reserve(platform::MKLDNNHandler::MaxKeyLength);
platform::MKLDNNHandler::AppendKeyDims(&key, input_dims);
platform::MKLDNNHandler::AppendKey(&key, pooling_type);
platform::MKLDNNHandler::AppendKeyVec(&key, ksize);
platform::MKLDNNHandler::AppendKeyVec(&key, strides);
platform::MKLDNNHandler::AppendKeyVec(&key, paddings);
platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
platform::MKLDNNHandler::AppendKey(&key, suffix);
return key;
}
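To make the key construction above concrete, here is a rough stand-in (our own toy, not the real MKLDNNHandler API) showing the kind of flat string the Append helpers build up:

```
// Toy sketch (ours): the key is essentially dims, attributes, data type and a
// suffix concatenated into one string, e.g. "8-3-224-224-2-2-...-max-f32-out".
#include <string>
#include <vector>

std::string ToyPoolKey(const std::vector<int>& dims,
                       const std::string& pooling_type,
                       const std::string& dt, const std::string& suffix) {
  std::string key;
  for (int d : dims) key += std::to_string(d) + "-";
  return key + pooling_type + "-" + dt + "-" + suffix;
}
```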
static inline int ComputeCeiledOutput(int input_size, int kernel_size,
......@@ -114,7 +115,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn::memory::data_type dt =
paddle::framework::ToMKLDNNDataType(input->type());
const std::string key = gethash(src_tz, pooling_type, ksize, strides,
const std::string key = CreateKey(ctx, src_tz, pooling_type, ksize, strides,
paddings, dt, ctx.op().Output("Out"));
const std::string key_pool_p = key + "@pool_p";
const std::string key_pool_pd = key + "@pool_pd";
......@@ -198,7 +199,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
std::vector<mkldnn::primitive> pipeline{*pool_p};
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
......@@ -294,7 +295,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// Get a unique name from the "argument" name of the "Out" variable
// This name will be used as the key when retrieving info from the device context
const std::string key =
gethash(diff_src_tz, pooling_type, ksize, strides, paddings,
CreateKey(ctx, diff_src_tz, pooling_type, ksize, strides, paddings,
memory::data_type::f32, ctx.op().Input("Out"));
const std::string key_pool_bwd_p = key + "@pool_bwd_p";
const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
......@@ -367,8 +368,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory);
pool_bwd_p = std::make_shared<pooling_backward>(
pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory,
*(diff_src_memory));
pool_bwd_pd, *diff_dst_memory, *workspace_memory, *diff_src_memory);
dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
} else {
......@@ -404,7 +404,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
if (is_diff_dst_reordered) {
pipeline.push_back(reorder_diff_dst);
}
pipeline.push_back(*(pool_bwd_p.get()));
pipeline.push_back(*pool_bwd_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
in_x_grad->set_layout(DataLayout::kMKLDNN);
......
......@@ -66,8 +66,7 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
"Fail to find softmax primitive in device context");
if (softmax_p == nullptr) {
softmax_p = std::make_shared<mkldnn::softmax_forward>(
*(softmax_pd_.get()),
*(static_cast<mkldnn::memory*>(src_memory_p.get())),
*softmax_pd_, *(static_cast<mkldnn::memory*>(src_memory_p.get())),
*(static_cast<mkldnn::memory*>(dst_memory_p.get())));
dev_ctx_.SetBlob(prim_key, softmax_p);
} else {
......@@ -88,8 +87,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
"Fail to find softmax backward primitive in device context");
if (softmax_bwd_p == nullptr) {
softmax_bwd_p = std::make_shared<mkldnn::softmax_backward>(
*softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()),
*(diff_src_memory_p.get()));
*softmax_bwd_pd_, *dst_memory_p, *diff_dst_memory_p,
*diff_src_memory_p);
dev_ctx_.SetBlob(prim_key, softmax_bwd_p);
} else {
is_reusing_ = true;
......@@ -159,6 +158,14 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
auto softmax_p =
handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);
// We cannot use softmax_dst_memory_p to get prim desc as
// it contains flattened dims (2D) while output tensor can
// have 2,3,4+ dims
auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(output->dims()),
mkldnn::memory::format::blocked);
output->set_mkldnn_prim_desc(output_mem_pd);
std::vector<primitive> pipeline{
*(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
stream(stream::kind::eager).submit(pipeline).wait();
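The comment above is about the dimensionality mismatch between the flattened softmax primitive and the output tensor; a minimal sketch of that flattening (our own helper, only for illustration) is:

```
// Sketch (ours): a [2, 3, 4] output is seen by the softmax primitive as the
// flattened [6, 4], so the primitive's dst descriptor cannot serve as the
// tensor's descriptor, which must keep the full rank.
#include <functional>
#include <numeric>
#include <vector>

std::vector<int> FlattenToTwoD(const std::vector<int>& dims) {
  int outer = std::accumulate(dims.begin(), dims.end() - 1, 1,
                              std::multiplies<int>());
  return {outer, dims.back()};
}
```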
......
......@@ -79,15 +79,6 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
memory::format input_format = input0.format();
if (src_tz.size() == 1 && (input_format == memory::format::nchw ||
input_format == memory::format::nhwc)) {
input_format = memory::format::x;
}
if (src_tz.size() == 2 && (input_format == memory::format::nchw ||
input_format == memory::format::nhwc)) {
input_format = memory::format::nc;
}
for (int i = 0; i < N; i++) {
PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
"all inputs must be all LoDTensors");
......@@ -115,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
auto dst_mem_pd = sum_pd.dst_primitive_desc();
std::shared_ptr<memory> dst_mem;
if (in_place) {
dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
dst_mem.reset(new memory(dst_mem_pd));
} else {
dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
dst_mem.reset(new memory(dst_mem_pd, output_data));
}
std::vector<mkldnn::primitive::at> inputs;
for (size_t i = 0; i < srcs_mem.size(); ++i) {
......@@ -145,107 +136,11 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (in_place) pipeline.push_back(reorder_prim);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(output_format);
} else if (out_var->IsType<framework::SelectedRows>()) {
// TODO(@mozga-intel) Add MKLDNN SelectedRows support
std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
auto& in_sel0 = in_vars[0]->Get<SelectedRows>();
auto& rows = in_sel0.rows();
in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
in0->mutable_value()->ShareDataWith(in_sel0.value());
}
auto get_selected_row = [&](size_t i) -> const SelectedRows& {
if (i == 0 && in0) {
return *in0.get();
} else {
return in_vars[i]->Get<SelectedRows>();
}
};
auto* out = ctx.Output<SelectedRows>("Out");
out->mutable_rows()->clear();
auto* out_value = out->mutable_value();
// Runtime InferShape
size_t first_dim = 0;
for (int i = 0; i < N; i++) {
auto& sel_row = get_selected_row(i);
first_dim += sel_row.rows().size();
}
std::vector<int64_t> in_dim;
for (int i = 0; i < N; i++) {
auto& sel_row = get_selected_row(i);
if (sel_row.rows().size() > 0) {
in_dim = framework::vectorize(sel_row.value().dims());
break;
}
}
if (in_dim.empty()) {
VLOG(3) << "WARNING: all the inputs are empty";
in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
} else {
in_dim[0] = static_cast<int64_t>(first_dim);
}
in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(ctx.GetPlace());
// if all the input sparse vars are empty, no need to
// merge these vars.
if (first_dim == 0UL) {
return;
}
math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
int64_t offset = 0;
for (int i = 0; i < N; i++) {
auto& sel_row = get_selected_row(i);
if (sel_row.rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
functor(ctx.template device_context<CPUDeviceContext>(), sel_row,
offset, out);
offset += sel_row.value().numel();
}
} else if (out_var->IsType<framework::LoDTensorArray>()) {
// TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
auto& out_array = *out_var->GetMutable<framework::LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
"Only support all inputs are TensorArray");
auto& in_array = in_vars[i]->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].numel() != 0) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (out_array[i].numel() == 0) {
framework::TensorCopy(in_array[i], in_array[i].place(),
ctx.device_context(), &out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
auto in = EigenVector<T>::Flatten(in_array[i]);
auto result = EigenVector<T>::Flatten(out_array[i]);
result.device(*ctx.template device_context<MKLDNNDeviceContext>()
.eigen_device()) = result + in;
}
}
}
}
} else {
PADDLE_THROW("Unexpected branch, output variable type is %s",
framework::ToTypeName(out_var->Type()));
output->set_mkldnn_prim_desc(dst_mem_pd);
} else { // Fallback to naive version
// TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support
SumKernel<CPUDeviceContext, T> reference_kernel;
reference_kernel.Compute(ctx);
}
}
};
......
......@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn_engine, key);
auto transpose_src_memory_p = handler.AcquireSrcMemory(
input->format(), platform::to_void_cast<T>(input_data));
input->get_mkldnn_prim_desc(), platform::to_void_cast<T>(input_data));
auto transpose_dst_memory_p =
handler.AcquireDstMemory(output, ctx.GetPlace());
auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
......@@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*transpose_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
// Transpose changes the logical dimensions of the Tensor, but a reorder does
// not. A reorder changes only the physical layout, e.g. format and strides,
// so we need to create a new primitive descriptor with the changed logical
// layout so that it matches the output shape
auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(output->dims()),
mkldnn::memory::format::blocked);
output->set_mkldnn_prim_desc(output_mem_pd);
}
};
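To illustrate the comment above about logical dimensions (our own toy, not from the diff): applying the transpose axis permutation to the dims gives the shape the new primitive descriptor must describe.

```
// Sketch (ours): out_dims[i] = in_dims[axis[i]],
// e.g. {2, 3, 4, 5} with axis {0, 2, 3, 1} -> {2, 4, 5, 3}.
#include <vector>

std::vector<int> TransposedDims(const std::vector<int>& dims,
                                const std::vector<int>& axis) {
  std::vector<int> out(dims.size());
  for (size_t i = 0; i < axis.size(); ++i) out[i] = dims[axis[i]];
  return out;
}
```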
......@@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
mkldnn_engine, key);
auto transpose_src_memory_p = handler.AcquireSrcMemory(
out_grad->format(), platform::to_void_cast<T>(out_grad_data));
auto transpose_src_memory_p =
handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(),
platform::to_void_cast<T>(out_grad_data));
auto transpose_dst_memory_p =
handler.AcquireDstMemory(x_grad, ctx.GetPlace());
auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
......@@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*transpose_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
// Transpose changes the logical dimensions of the Tensor, but a reorder does
// not. A reorder changes only the physical layout, e.g. format and strides,
// so we need to create a new primitive descriptor with the changed logical
// layout so that it matches the output shape
auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(x_grad->dims()),
mkldnn::memory::format::blocked);
x_grad->set_mkldnn_prim_desc(x_grad_mem_pd);
}
};
......
......@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
"be ignored."); // TODO(Chengduo): Add checker.
// (Currently,
// TypedAttrChecker doesn't support vector type.)
AddAttr<bool>("global_pooling",
AddAttr<bool>(
"global_pooling",
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, ksize and paddings will be ignored.")
"If global_pooling = true, kernel size and paddings will be ignored.")
.SetDefault(false);
AddAttr<std::vector<int>>("strides",
"(vector<int>, default {1, 1}), strides(height, "
......@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
"paddings",
"(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator."
"If global_pooling = true, paddings and ksize will be ignored.")
"If global_pooling = true, paddings and kernel size will be ignored.")
.SetDefault({0, 0});
AddAttr<bool>(
"exclusive",
......@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
.SetDefault(false);
AddAttr<bool>(
"ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
......@@ -262,28 +263,37 @@ Example:
For exclusive = false:
$$
hstart = i * strides[0] - paddings[0]
$$
$$
hend = hstart + ksize[0]
$$
$$
wstart = j * strides[1] - paddings[1]
$$
$$
wend = wstart + ksize[1]
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$
For exclusive = true:
$$
hstart = max(0, i * strides[0] - paddings[0])
$$
$$
hend = min(H, hstart + ksize[0])
$$
$$
wstart = max(0, j * strides[1] - paddings[1])
$$
$$
wend = min(W, wstart + ksize[1])
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
For adaptive = true:
$$
hstart = floor(i * H_{in} / H_{out})
hend = ceil((i + 1) * H_{in} / H_{out})
wstart = floor(j * W_{in} / W_{out})
wend = ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC");
}
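As a quick worked example of the two averaging modes above (our own numbers, not from the original doc), restrict attention to the height dimension with ksize = 3, strides = 1, paddings = 1. At i = 0, the non-exclusive window spans hstart = -1 to hend = 2, so only two input values fall inside it, yet the divisor differs:

$$
exclusive = false: Output(0) = \frac{x_0 + x_1}{3}, \qquad exclusive = true: Output(0) = \frac{x_0 + x_1}{2}
$$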
......@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
AddAttr<bool>(
"global_pooling",
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, ksize and paddings wille be ignored.")
"If global_pooling = true, kernel size and paddings will be ignored.")
.SetDefault(false);
AddAttr<std::vector<int>>(
"strides",
......@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
.SetDefault(false);
AddAttr<bool>(
"ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
......@@ -393,45 +403,65 @@ Example:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
For ceil_mode = false:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
For ceil_mode = true:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$
For exclusive = false:
$$
dstart = i * strides[0] - paddings[0]
$$
$$
dend = dstart + ksize[0]
$$
$$
hstart = j * strides[1] - paddings[1]
$$
$$
hend = hstart + ksize[1]
$$
$$
wstart = k * strides[2] - paddings[2]
$$
$$
wend = wstart + ksize[2]
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
For exclusive = true:
$$
dstart = max(0, i * strides[0] - paddings[0])
$$
$$
dend = min(D, dstart + ksize[0])
$$
$$
hstart = max(0, j * strides[1] - paddings[1])
$$
$$
hend = min(H, hstart + ksize[1])
$$
$$
wstart = max(0, k * strides[2] - paddings[2])
$$
$$
wend = min(W, wstart + ksize[2])
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
For adaptive = true:
$$
dstart = floor(i * D_{in} / D_{out})
dend = ceil((i + 1) * D_{in} / D_{out})
hstart = floor(j * H_{in} / H_{out})
hend = ceil((j + 1) * H_{in} / H_{out})
wstart = floor(k * W_{in} / W_{out})
wend = ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/fluid/operators/math/sample_prob.h"
namespace paddle {
namespace operators {
class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Logits",
"(Tensor, default: Tensor<float>), The unscaled log probabilities "
"which is a 2-D tensor with shape [N x K]. N is the batch_size, "
"and K is the class number.");
AddInput("Labels",
"(Tensor) The ground truth which is a 2-D tensor. Labels is a "
"Tensor<int64> with shape [N x NT], where NT is the number of"
"true labels for each example.");
AddInput("CustomizedSamples",
"(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
"NT + S],"
" where N is the batch size, NT is the number of true labels "
"and S is the number of negtive sample for each example."
"The first NT elements of each row should be the same with true "
"labels, "
"followed by S custom negtive samples. This tensor"
"is only used when use_customized_samples is true.")
.AsDispensable();
AddInput(
"CustomizedProbabilities",
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The tensor has the same shape with CustomSamples,"
"and each element represents probability of element in CustomSamples. "
"This "
"tensor is only used when use_customized_samples is true.")
.AsDispensable();
AddOutput("Samples",
"(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
"NT + S]."
"The outputs value of sampler, including NT true lables and S "
"negetive samples "
"for each example. This will be used in"
"backward calculation.")
.AsIntermediate();
AddOutput(
"Probabilities",
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The probabilites of sampled positive and negtive labels.")
.AsIntermediate();
AddOutput("SampledLogits",
"(Tensor, default: Tensor<float>), A 2-D tensor with shape"
"[N, NT + S]. The outputs value of sampled logits, which will be"
"used in backward propagation.")
.AsIntermediate();
AddOutput(
"SampledLabels",
"(Tensor, default: Tensor<int64>), A 2-D tensor. The sampled labels"
"with shape [N, NT]. The tonsor contains hard labels as input to "
" softmax op, that is 0, 1, ..., NT-1 because of the first NT elements"
" of Sampels are positive lables.");
AddAttr<bool>(
"use_customized_samples",
"An indicator whether to use customized samples with probabilities, if "
"True"
"the operator will use customized samples and customized probabilities"
"otherwise, the operator will generate them by itself.")
.SetDefault(false);
AddAttr<bool>(
"uniq",
"An indicator whether to sample non-repetitive negtive labels, if True"
"the operator will sample negtive labels without replacement."
"Otherwise, the operator will sample negtive labels with replacement.")
.SetDefault(true);
AddAttr<bool>(
"remove_accidental_hits",
"An indicator whether to remove accidental hits when samples hits true"
"labels, the removal is implemented by subtracting the corresponding"
"logits by float_max to subpress their softmax to be zero.")
.SetDefault(true);
AddAttr<int>("num_samples", "The number of negative samples.");
AddAttr<int>("seed", "Random seed for generating samples").SetDefault(0);
AddComment(R"DOC(
"""
Computes sampled output training logits and labels suitable for implementing
sampled softmax.
"""
)DOC");
}
};
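For background (our own summary, not part of the operator's documentation): sampled softmax evaluates the softmax only over the NT true labels plus S sampled classes, and corrects each gathered logit by the log of its sampling probability,

$$
SampledLogits_{i,j} = Logits_{i,\,s_{i,j}} - \log Q(s_{i,j} \mid x_i),
$$

which matches the kernels below: they gather the sampled columns, optionally remove accidental hits, and then subtract probs.log().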
class SampleLogitsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Logits"),
"Input(Logits) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Samples"),
"Output(Samples) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Probabilities"),
"Output(Probabilities) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("SampledLogits"),
"Output(SampledLogits) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"),
"Output(SampledLabels) should be not null.");
auto logits_dims = ctx->GetInputDim("Logits");
auto labels_dims = ctx->GetInputDim("Labels");
PADDLE_ENFORCE_EQ(
logits_dims.size(), 2UL,
"The logits of softmax_with_cross_entropy should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
"The labels should be a 2-D tensor.");
const int num_samples = ctx->Attrs().Get<int>("num_samples");
const int num_sampled_classes = labels_dims[1] + num_samples;
ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes});
ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes});
ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes});
ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]});
}
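A worked shape example for the InferShape above (our own numbers, purely illustrative):

```
// Example (ours): N = 32, K = 1000, NT = 1, num_samples (S) = 25:
//   Samples, Probabilities, SampledLogits -> [32, 1 + 25] = [32, 26]
//   SampledLabels                         -> [32, 1]
```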
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Logits"));
framework::OpKernelType kt =
framework::OpKernelType(data_type, ctx.device_context());
return kt;
}
};
// UNDERSTAND: InferShape for Grad
class SampleLogitsOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Logits"),
"Input(Logits) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Samples"),
"Input(Samples) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("SampledLogits"),
"Input(SampledLogits) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")),
"Input(SampledLogits@Grad) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
"Output(Logits@Grad) should be not null.");
auto logit_dims = ctx->GetInputDim("Logits");
auto label_dims = ctx->GetInputDim("Labels");
PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
"The label should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL,
"The logits should be a 2-D tensor.");
ctx->SetOutputDim(framework::GradVarName("Logits"),
ctx->GetInputDim("Logits"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(
ctx.InputVar(framework::GradVarName("SampledLogits")));
framework::OpKernelType kt =
framework::OpKernelType(data_type, ctx.device_context());
return kt;
}
};
// UNDERSTAND: what's the rule for making a GradMaker TODO
class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* grad_op = new framework::OpDesc();
grad_op->SetType("sample_logits_grad");
grad_op->SetInput("Logits", Input("Logits"));
grad_op->SetInput("Labels", Input("Labels"));
grad_op->SetInput("Samples", Output("Samples"));
grad_op->SetInput("SampledLogits", Output("SampledLogits"));
grad_op->SetInput(framework::GradVarName("SampledLogits"),
OutputGrad("SampledLogits"));
grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(sample_logits, ops::SampleLogitsOp, ops::SampleLogitsOpMaker,
ops::SampleLogitsGradMaker);
REGISTER_OPERATOR(sample_logits_grad, ops::SampleLogitsOpGrad);
REGISTER_OP_CPU_KERNEL(sample_logits, ops::SampleLogitsKernel<float>,
ops::SampleLogitsKernel<double>);
REGISTER_OP_CPU_KERNEL(sample_logits_grad, ops::SampleLogitsGradKernel<float>,
ops::SampleLogitsGradKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h"
namespace paddle {
namespace operators {
// UNDERSTAND: something like take_along_axis in numpy.
template <typename T>
__global__ void GPUTakeAlongD1(size_t size, const int batch_size,
const int array_slice_size,
const int idx_slice_size, const T* p_array,
const int64_t* p_index, T* p_value) {
const auto value_slice_size = idx_slice_size;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int step_size = blockDim.x * gridDim.x;
for (; idx < size; idx += step_size) {
int i = idx / idx_slice_size;
auto array_index = p_index[idx];
p_value[idx] = p_array[i * array_slice_size + array_index];
}
}
// UNDERSTAND: something like put_along_axis in numpy, but if there are
// duplicate indices, the scatter is done in a += way.
template <typename T>
__global__ void GPUPutAlongD1(size_t size, const int batch_size,
const int array_slice_size,
const int idx_slice_size, T* p_array,
const int64_t* p_index, const T* p_value) {
const auto value_slice_size = idx_slice_size;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int step_size = blockDim.x * gridDim.x;
// size == batch_size
for (; idx < size; idx += step_size) {
int i = idx;
for (int j = 0; j < idx_slice_size; ++j) {
auto array_index = p_index[i * idx_slice_size + j];
p_array[i * array_slice_size + array_index] +=
p_value[i * idx_slice_size + j];
}
}
}
// UNDERSTAND: set label as 0,1,...,num_true-1
template <typename T>
__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int step_size = blockDim.x * gridDim.x;
for (; idx < size; idx += step_size) {
p_array[idx] = idx % num_true;
}
}
// UNDERSTAND: compute accidental hits from samples and subtract a float max
// (here 1e20) from the corresponding logits
template <typename T>
__global__ void gpu_compute_remove_accidental_hits(const int size,
const int num_true,
const int idx_slice_size,
const int64_t* p_index,
T* p_value) {
const auto value_slice_size = idx_slice_size;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int step_size = blockDim.x * gridDim.x;
for (; idx < size; idx += step_size) {
int i = idx / idx_slice_size;
if (idx % idx_slice_size < num_true) continue;
for (int j = 0; j < num_true; ++j) {
const auto true_idx = i * idx_slice_size + j;
if (p_index[true_idx] == p_index[idx]) {
p_value[idx] -= 1e20;
break;
}
}
}
}
template <typename T>
class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
public:
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
// get necessary inputs
const Tensor* logits = context.Input<Tensor>("Logits");
const Tensor* labels = context.Input<Tensor>("Labels");
VLOG(3) << "Enter SampleLogitsCUDAKernel";
// get necessary outputs
Tensor* samples = context.Output<Tensor>("Samples");
Tensor* probabilities = context.Output<Tensor>("Probabilities");
Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");
// shapes
const auto batch_size = logits->dims()[0];
const auto num_classes = logits->dims()[1];
const auto labels_dim = labels->dims();
const auto num_true = labels_dim[1];
const auto samples_dim = samples->dims();
// attrs
const auto num_samples = context.Attr<int>("num_samples");
const bool use_customized_samples =
context.Attr<bool>("use_customized_samples");
const bool uniq = context.Attr<bool>("uniq");
const bool remove_accidental_hits =
context.Attr<bool>("remove_accidental_hits");
// device contexts
auto& dev_ctx = context.cuda_device_context();
// UNDERSTAND: allocate memories for temporaries
sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
set_zero(dev_ctx, sampled_logits, static_cast<T>(0));
auto sampled_labels_data =
sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
int threads = 512;
size_t size = batch_size * num_true;
int grid = (size + threads - 1) / threads;
GPUSetLabel<
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, num_true, sampled_labels_data);
if (use_customized_samples) {
const Tensor* customized_samples =
context.Input<Tensor>("CustomizedSamples");
const Tensor* customized_probabilities =
context.Input<Tensor>("CustomizedProbabilities");
samples->ShareDataWith(*customized_samples);
probabilities->ShareDataWith(*customized_probabilities);
} else {
samples->mutable_data<int64_t>(context.GetPlace());
probabilities->mutable_data<T>(samples_dim, context.GetPlace());
// UNDERSTAND: sampling
const auto seed = context.Attr<int>("seed");
auto sampler_with_prob = math::GPUSampleWithProb<T>();
sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq,
num_samples, labels, samples, probabilities);
}
// UNDERSTAND: gather sampled logits and remove accidental hits if needed
const auto num_take = samples->dims()[1];
const auto array_dims = logits->dims();
const auto idx_dims = samples->dims();
const T* p_array = logits->data<T>();
const int64_t* p_index = samples->data<int64_t>();
T* p_value = sampled_logits->data<T>();
// src slice size
const auto array_slice_size = array_dims[1];
// index slice size
const auto idx_slice_size = idx_dims[1];
size = batch_size * num_take;
grid = (size + threads - 1) / threads;
GPUTakeAlongD1<
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
p_value);
if (remove_accidental_hits) {
const size_t size = batch_size * (num_true + num_samples);
int grid = (size + threads - 1) / threads;
gpu_compute_remove_accidental_hits<
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, num_true, idx_slice_size, p_index, p_value);
}
// subtract logQ(y|x) from the sampled logits
auto probs = EigenMatrix<T>::From(*probabilities);
auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
smp_logits.device(*dev_ctx.eigen_device()) =
(smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
.unaryExpr(TolerableValue<T>());
}
};
template <typename T>
class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
public:
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* samples = context.Input<Tensor>("Samples");
const Tensor* sampled_logits_grad =
context.Input<Tensor>(framework::GradVarName("SampledLogits"));
logits_grad->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.cuda_device_context();
math::SetConstant<platform::CUDADeviceContext, T> set_zero;
set_zero(dev_ctx, logits_grad, static_cast<T>(0));
// UNDERSTAND: scatter it back to logit_grad
const auto batch_size = samples->dims()[0];
const auto num_put = samples->dims()[1];
const auto array_dims = logits_grad->dims();
const auto idx_dims = samples->dims();
T* p_array = logits_grad->data<T>();
const int64_t* p_index = samples->data<int64_t>();
const T* p_value = sampled_logits_grad->data<T>();
// src slice size
const auto array_slice_size = array_dims[1];
// index slice size
const auto idx_slice_size = idx_dims[1];
int threads = 128;
const size_t size = batch_size;
int grid = (size + threads - 1) / threads;
GPUPutAlongD1<
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
p_value);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(sample_logits, ops::SampleLogitsCUDAKernel<float>,
ops::SampleLogitsCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(sample_logits_grad,
ops::SampleLogitsGradCUDAKernel<float>,
ops::SampleLogitsGradCUDAKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
struct TolerableValue {
HOSTDEVICE T operator()(const T& x) const {
PADDLE_ASSERT(std::is_floating_point<T>::value);
const T kApproInf = 1e20;
if (x == INFINITY) return kApproInf;
if (x == -INFINITY) return -kApproInf;
return x;
}
};
// UNDERSTAND: something like take_along_axis in numpy.
template <typename T>
static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
const framework::Tensor& array,
const framework::Tensor& index,
framework::Tensor* value) {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 &&
index.dims()[0] == array.dims()[0] &&
index.dims() == value->dims());
const auto batch_size = index.dims()[0];
const auto num_take = index.dims()[1];
const auto array_dims = array.dims();
const auto idx_dims = index.dims();
// UNDERSTAND: no allocations here
const T* p_array = array.data<T>();
const int64_t* p_index = index.data<int64_t>();
T* p_value = value->data<T>();
// src slice size
const auto array_slice_size = array_dims[1];
// index slice size
const auto idx_slice_size = idx_dims[1];
const auto value_slice_size = idx_slice_size;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_take; ++j) {
auto array_index = p_index[i * idx_slice_size + j];
p_value[i * value_slice_size + j] =
p_array[i * array_slice_size + array_index];
}
}
}
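A tiny standalone example (ours) of the gather semantics CPUTakeAlongD1 implements, i.e. value[i][j] = array[i][index[i][j]]:

```
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<std::vector<float>> array = {{10, 11, 12}, {20, 21, 22}};
  std::vector<std::vector<int64_t>> index = {{2, 0}, {1, 1}};
  for (size_t i = 0; i < array.size(); ++i) {
    for (size_t j = 0; j < index[i].size(); ++j) {
      std::cout << array[i][index[i][j]] << " ";  // row 0: 12 10, row 1: 21 21
    }
    std::cout << "\n";
  }
  return 0;
}
```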
// UNDERSTAND: something like put_along_axis in numpy, but if there are
// duplicate indices, the scatter is done in a += way.
template <typename T>
static void CPUPutAlongD1(const platform::DeviceContext& ctx,
framework::Tensor* array,
const framework::Tensor& index,
const framework::Tensor& value) {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 &&
index.dims()[0] == array->dims()[0] &&
index.dims() == value.dims());
const auto batch_size = index.dims()[0];
const auto num_put = index.dims()[1];
auto array_dims = array->dims();
auto idx_dims = index.dims();
// UNDERSTAND: no allocations here
T* p_array = array->data<T>();
const int64_t* p_index = index.data<int64_t>();
const T* p_value = value.data<T>();
// slice sizes
const auto array_slice_size = array_dims[1];
const auto idx_slice_size = idx_dims[1];
const auto value_slice_size = idx_slice_size;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_put; ++j) {
auto array_index = p_index[i * idx_slice_size + j];
p_array[i * array_slice_size + array_index] +=
p_value[i * value_slice_size + j];
}
}
}
// UNDERSTAND: compute accidental hits from samples and subtract a float max
// (here 1e20) from the corresponding logits
template <typename T>
static void compute_remove_accidental_hits(const platform::DeviceContext& ctx,
framework::Tensor* sampled_logits,
const framework::Tensor& samples,
const int num_true) {
const auto batch_size = sampled_logits->dims()[0];
const auto num_sampled_classes = sampled_logits->dims()[1];
T* sampled_logits_data = sampled_logits->data<T>();
const auto samples_data = samples.data<int64_t>();
std::unordered_set<int64_t> tmp_true_labels;
for (int i = 0; i < batch_size; ++i) {
tmp_true_labels.clear();
tmp_true_labels.insert(samples_data + i * num_sampled_classes,
samples_data + i * num_sampled_classes + num_true);
for (int j = num_true; j < num_sampled_classes; ++j) {
const auto idx = i * num_sampled_classes + j;
if (tmp_true_labels.find(samples_data[idx]) != tmp_true_labels.end())
sampled_logits_data[idx] -= 1e20;
}
}
}
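The effect of the 1e20 subtraction above (our own illustration): a logit shifted that far down contributes essentially nothing after softmax,

$$
softmax(z)_j = \frac{e^{z_j - 10^{20}}}{\sum_{k \neq j} e^{z_k} + e^{z_j - 10^{20}}} \approx 0,
$$

so an accidentally re-sampled true label cannot be treated as a negative.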
template <typename T>
class SampleLogitsKernel : public framework::OpKernel<T> {
public:
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
"This kernel only runs on CPU.");
VLOG(3) << "Enter SampleLogitsKernel";
// get necessary inputs
const Tensor* logits = context.Input<Tensor>("Logits");
const Tensor* labels = context.Input<Tensor>("Labels");
// get necessary outputs
Tensor* samples = context.Output<Tensor>("Samples");
Tensor* probabilities = context.Output<Tensor>("Probabilities");
Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");
// shapes
const auto batch_size = logits->dims()[0];
const auto num_classes = logits->dims()[1];
const auto labels_dim = labels->dims();
const auto num_true = labels_dim[1];
const auto samples_dim = samples->dims();
// attrs
const auto num_samples = context.Attr<int>("num_samples");
const bool use_customized_samples =
context.Attr<bool>("use_customized_samples");
const bool remove_accidental_hits =
context.Attr<bool>("remove_accidental_hits");
// device contexts
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
// UNDERSTAND: allocate memories for temporaries
sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
auto sampled_labels_data =
sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < num_true; ++j) {
sampled_labels_data[i * num_true + j] = j;
}
}
if (use_customized_samples) {
const Tensor* customized_samples =
context.Input<Tensor>("CustomizedSamples");
const Tensor* customized_probabilities =
context.Input<Tensor>("CustomizedProbabilities");
samples->ShareDataWith(*customized_samples);
probabilities->ShareDataWith(*customized_probabilities);
} else {
samples->mutable_data<int64_t>(context.GetPlace());
probabilities->mutable_data<T>(samples_dim, context.GetPlace());
// UNDERSTAND: sampling
const auto seed = context.Attr<int>("seed");
auto sampler_with_prob =
math::SampleWithProb<platform::CPUDeviceContext, T>();
sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed),
num_samples, labels, samples, probabilities);
}
// UNDERSTAND: gather sampled logits and remove accidental hits if needed
CPUTakeAlongD1<T>(dev_ctx, *logits, *samples, sampled_logits);
if (remove_accidental_hits) {
compute_remove_accidental_hits<T>(dev_ctx, sampled_logits, *samples,
num_true);
}
// subtract logQ(y|x) from the sampled logits
auto probs = EigenMatrix<T>::From(*probabilities);
auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
smp_logits.device(*dev_ctx.eigen_device()) =
(smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
.unaryExpr(TolerableValue<T>());
}
};
template <typename T>
class SampleLogitsGradKernel : public framework::OpKernel<T> {
public:
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* samples = context.Input<Tensor>("Samples");
const Tensor* sampled_logits_grad =
context.Input<Tensor>(framework::GradVarName("SampledLogits"));
logits_grad->mutable_data<T>(context.GetPlace());
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
set_zero(dev_ctx, logits_grad, static_cast<T>(0));
// UNDERSTAND: scatter it back to logit_grad
CPUPutAlongD1<T>(dev_ctx, logits_grad, *samples, *sampled_logits_grad);
}
};
} // namespace operators
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
......@@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
}
}
template <typename T>
static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
const LoDTensor& x, LoDTensor* out,
const framework::Vector<size_t>& x_lod,
const framework::Vector<size_t>& ref_lod,
bool do_copy) {
auto out_data = out->data<T>();
auto x_data = x.data<T>();
auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
int x_item_length = x.numel() / x.dims()[0];
int out_offset = 0;
int num_copys = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
if (do_copy) {
int out_start = out_offset;
if (out->lod().size() == 1) {
out_start = out->lod()[0][out_offset];
}
for (int j = 0; j < repeat_num; j++) {
for (int k = 0; k < x_seq_len; k++) {
memory::Copy(
gpu_place,
out_data + (out_start + j * x_seq_len + k) * x_item_length,
gpu_place, x_data + (x_start + k) * x_item_length,
sizeof(T) * x_item_length, context.stream());
}
}
} else {
num_copys += repeat_num * x_seq_len;
}
}
out_offset += repeat_num;
}
return num_copys;
}
template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
void operator()(
......@@ -95,11 +139,30 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
int num_copys =
ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
// Sometimes direct copies will be faster; this may need deeper analysis.
if (num_copys < 5) {
ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
} else {
int x_item_length = x.numel() / x.dims()[0];
framework::Vector<size_t> out_offset(x_lod.size());
size_t x_lod_size = x_lod.size();
framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
for (size_t i = 0; i < x_lod_size; ++i) {
out_offset[x_lod_size + i] = x_lod[i];
}
for (size_t i = 0; i < ref_lod.size(); ++i) {
out_offset[2 * x_lod_size + i] = ref_lod[i];
}
const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
const size_t* x_lod_data = out_offset_data + x_lod_size;
const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
int thread_x =
std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
......@@ -107,10 +170,9 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
dim3 grid_size(block_x, 1);
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), x_lod.CUDAData(context.GetPlace()),
ref_lod.CUDAData(context.GetPlace()),
out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
out->mutable_data<T>(context.GetPlace()));
x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
x_item_length, out->mutable_data<T>(context.GetPlace()));
}
}
};
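The functor above packs three index arrays into one framework::Vector so that a single CUDAData() transfer serves all of them; a host-side sketch of that packing (our own toy, not the real API) is:

```
// Sketch (ours): layout is [ out_offset | x_lod | ref_lod ] in one buffer;
// x_lod_data / ref_lod_data in the kernel launch are just pointer offsets.
#include <cstddef>
#include <vector>

std::vector<size_t> PackForOneCopy(const std::vector<size_t>& out_offset,
                                   const std::vector<size_t>& x_lod,
                                   const std::vector<size_t>& ref_lod) {
  std::vector<size_t> packed(out_offset);
  packed.insert(packed.end(), x_lod.begin(), x_lod.end());
  packed.insert(packed.end(), ref_lod.begin(), ref_lod.end());
  return packed;
}
```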
......
......@@ -117,11 +117,11 @@ class TeacherStudentSigmoidLossOpMaker
"[N x 1]. The teacher student sigmoid loss.");
AddAttr<float>(
"soft_max_up_bound",
"fp32, if input > soft_max_up_bound, will be bound, default 15.0")
"fp32, if input > soft_max_up_bound, input will be bound, default 15.0")
.SetDefault(15.0);
AddAttr<float>(
"soft_max_lower_bound",
"fp32, if input < soft_max_lower_bound, will be bound, default -15.0")
AddAttr<float>("soft_max_lower_bound",
"fp32, if input < soft_max_lower_bound, input will be "
"bound, default -15.0")
.SetDefault(-15.0);
AddComment(R"DOC(
TeacherStudentSigmoidLoss Operator.
......
......@@ -87,11 +87,11 @@ nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto device_context ${GPU_CTX_DEPS})
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_context device_tracer)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
else()
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
......
......@@ -394,7 +394,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
int tid = platform::get_cur_thread_id();
std::lock_guard<std::mutex> lock(*p_mutex_.get());
std::lock_guard<std::mutex> lock(*p_mutex_);
// Find KeyBlob for current thread
auto map_it = pMap->find(tid);
......@@ -427,7 +427,7 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
int tid = platform::get_cur_thread_id();
std::lock_guard<std::mutex> lock(*p_mutex_.get());
std::lock_guard<std::mutex> lock(*p_mutex_);
// Find KeyBlob for current thread firstly
auto map_it = pMap->find(tid);
......
......@@ -136,7 +136,7 @@ void EnableActivity() {
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
// We don't track these activities for now.
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
......@@ -155,7 +155,7 @@ void DisableActivity() {
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
......@@ -212,6 +212,14 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
memcpy->correlationId, memcpy->bytes);
break;
}
case CUPTI_ACTIVITY_KIND_MEMSET: {
auto *memset =
reinterpret_cast<const CUpti_ActivityMemset *>(record);
tracer->AddKernelRecords("MEMSET", memset->start, memset->end,
memset->deviceId, memset->streamId,
memset->correlationId);
break;
}
case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
......@@ -348,6 +356,8 @@ class DeviceTracerImpl : public DeviceTracer {
const std::vector<int> cbids {
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
#if CUDA_VERSION >= 9000
......@@ -601,6 +611,8 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
#if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"
......@@ -32,8 +33,6 @@ inline uint64_t PosixInNsec() {
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
class Event;
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
......
......@@ -34,6 +34,7 @@ limitations under the License. */
#include <type_traits>
#include <utility>
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
namespace paddle {
namespace platform {
enum EventType { kMark, kPushRange, kPopRange };
class Event {
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventType type, std::string name, uint32_t thread_id);
const EventType& type() const;
std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; }
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t event() const { return event_; }
int device() const { return device_; }
#endif
#endif
double CpuElapsedMs(const Event& e) const;
double CudaElapsedMs(const Event& e) const;
private:
EventType type_;
std::string name_;
uint32_t thread_id_;
int64_t cpu_ns_;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t gpu_ns_ = 0;
public:
void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
gpu_ns_ += end_ns - start_ns;
}
private:
#else
cudaEvent_t event_ = nullptr;
int device_ = -1;
#endif
#endif
};
} // namespace platform
} // namespace paddle
......@@ -39,6 +39,45 @@ class MKLDNNHandler {
return this->AcquireMemory(md, ptr, "@user_src_mem_p");
}
// TODO(jczaja): extract common part and make AcquireMemory
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
const mkldnn::memory::primitive_desc& mpd, void* ptr) {
auto local_key = key_ + "@user_src_mem_p";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
               "Fail to find mem primitive in device context");
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happened. All primitives from an operator instance
// should be reused, or none of them, so we check for consistency.
is_reusing_ = true;
}
return mem_p;
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
const mkldnn::memory::primitive_desc& mpd, void* ptr) {
auto local_key = key_ + "@user_weights_mem_p";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
               "Fail to find mem primitive in device context");
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happened. All primitives from an operator instance
// should be reused, or none of them, so we check for consistency.
is_reusing_ = true;
}
return mem_p;
}
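Both Acquire* overloads above follow the same acquire-or-reuse pattern: look the memory primitive up in the device context by a suffixed key, create and cache it on a miss, and on a hit only repoint its data handle while flagging is_reusing_. A minimal, hypothetical Python sketch of that caching pattern (not the MKLDNN API; the key and builder are made up):

```
# Hypothetical stand-in for the MKLDNNDeviceContext blob map.
blob_cache = {}

def acquire_memory(key, suffix, build, data_handle):
    local_key = key + suffix
    mem = blob_cache.get(local_key)
    if mem is None:
        mem = build(data_handle)          # first use: create and cache
        blob_cache[local_key] = mem
    else:
        mem["data_handle"] = data_handle  # reuse: only repoint the data handle
    return mem

build = lambda ptr: {"data_handle": ptr}
a = acquire_memory("conv_fwd", "@user_src_mem_p", build, "ptr0")
b = acquire_memory("conv_fwd", "@user_src_mem_p", build, "ptr1")
assert a is b and a["data_handle"] == "ptr1"  # same cached object, new handle
```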
std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
const mkldnn::memory::desc& md, void* ptr,
user_function custom_func = {}) {
......@@ -232,7 +271,6 @@ class MKLDNNHandler {
AppendKey(key, suffix);
}
protected:
static void AppendKeyDims(std::string* key,
const mkldnn::memory::dims& dims) {
for (unsigned int i = 0; i < dims.size(); i++) {
......@@ -250,6 +288,7 @@ class MKLDNNHandler {
key->append(s);
}
protected:
static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
......@@ -263,6 +302,9 @@ class MKLDNNHandler {
mkldnn::engine engine_;
std::string key_;
bool is_reusing_;
public:
static constexpr int MaxKeyLength = 256;
};
class TransposeMKLDNNHandler : public MKLDNNHandler {
......@@ -273,37 +315,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
mkldnn::engine engine, const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
dims_(dims),
axis_(axis),
logical_axis_(dims.size(), 0) {}
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
const mkldnn::memory::format& fmt, void* ptr) {
auto local_key = key_ + "@user_src_mem_p";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
               "Fail to find mem primitive in device context");
if (mem_p == nullptr) {
// Make the memory descriptor using the input format, unless it
// cannot be trusted (nchw); then construct the memory format manually
for (size_t i = 0; i < logical_axis_.size(); ++i) {
logical_axis_[i] = i;
}
auto src_md = fmt != mkldnn::memory::format::nchw
? platform::MKLDNNMemDesc(
dims_, platform::MKLDNNGetDataType<float>(), fmt)
: Axis2MemoryDesc(dims_, logical_axis_);
mem_p = std::make_shared<mkldnn::memory>(
mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happened. All primitives from an operator instance
// should be reused, or none of them, so we check for consistency.
is_reusing_ = true;
}
return mem_p;
}
axis_(axis) {}
std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
platform::Place place) {
......@@ -388,7 +400,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
private:
std::vector<int> dims_;
std::vector<int> axis_;
std::vector<int> logical_axis_;
};
template <class forward_t, class backward_data_t, class backward_weights_t>
......@@ -548,9 +559,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (conv_p == nullptr) {
conv_p = std::make_shared<forward_t>(*conv_pd_, *(src_memory_p),
*(weights_memory_p.get()),
*(dst_memory_p.get()));
conv_p = std::make_shared<forward_t>(*conv_pd_, *src_memory_p,
*weights_memory_p, *dst_memory_p);
dev_ctx_.SetBlob(prim_key, conv_p);
} else {
......@@ -570,9 +580,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (conv_p == nullptr) {
conv_p = std::make_shared<forward_t>(
*conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
*(bias_memory_p.get()), *(dst_memory_p.get()));
conv_p = std::make_shared<forward_t>(*conv_pd_, *src_memory_p,
*weights_memory_p, *bias_memory_p,
*dst_memory_p);
dev_ctx_.SetBlob(prim_key, conv_p);
} else {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkldnn.h>
#include <string>
namespace paddle {
namespace platform {
inline mkldnn::memory::primitive_desc create_prim_desc_from_dims(
const std::vector<int>& ltz, mkldnn::memory::format fmt,
mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) {
mkldnn_memory_desc_t mem_fmt;
mem_fmt.primitive_kind = mkldnn_memory;
mem_fmt.ndims = ltz.size();
for (unsigned int i = 0; i < ltz.size(); ++i) {
mem_fmt.dims[i] = ltz[i];  // logical dimensions (nchw format,
// regardless of physical layout)
}
mem_fmt.data_type = static_cast<mkldnn_data_type_t>(data_type);
mem_fmt.format = static_cast<mkldnn_memory_format_t>(fmt);
unsigned int total_stride = 1;
for (int i = ltz.size() - 1; i >= 0; --i) {
mem_fmt.layout_desc.blocking.padding_dims[i] =
ltz[i];  // logical dimensions (nchw format, regardless of physical
// layout)
mem_fmt.layout_desc.blocking.block_dims[i] = 1;
mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset
mem_fmt.layout_desc.blocking.strides[0][i] = total_stride;
mem_fmt.layout_desc.blocking.strides[1][i] = 1;
total_stride *= ltz[i];
}
mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset
auto& pool = platform::DeviceContextPool::Instance();
auto place = paddle::platform::CPUPlace();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
auto& cpu_engine = dev_ctx->GetEngine();
return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine);
}
inline mkldnn::memory::primitive_desc create_prim_desc_from_format(
const std::vector<int>& ltz, const mkldnn::memory::format format,
const mkldnn::memory::data_type data_type) {
auto md = mkldnn::memory::desc({ltz}, data_type, format);
auto& pool = platform::DeviceContextPool::Instance();
auto place = paddle::platform::CPUPlace();
auto dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device");
auto& cpu_engine = dev_ctx->GetEngine();
return mkldnn::memory::primitive_desc(md, cpu_engine);
}
} // namespace platform
} // namespace paddle
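As a cross-check, the total_stride loop in create_prim_desc_from_dims above builds plain row-major strides over the logical (nchw) dimensions. A small Python sketch of the same accumulation, with a hypothetical helper name:

```
def row_major_strides(dims):
    # Accumulate from the innermost dimension outwards, exactly as the
    # total_stride loop above fills layout_desc.blocking.strides[0].
    strides = [0] * len(dims)
    total_stride = 1
    for i in range(len(dims) - 1, -1, -1):
        strides[i] = total_stride
        total_stride *= dims[i]
    return strides

print(row_major_strides([2, 3, 4, 5]))  # -> [60, 20, 5, 1]
```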
......@@ -254,9 +254,11 @@ struct EventItem {
std::string name;
int calls;
double total_time;
double min_time;
double max_time;
double ave_time;
double min_time;
double cpu_time;
double gpu_time;
float ratio;
};
......@@ -290,8 +292,12 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total";
if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
<< std::setw(data_width * 2) << "GPU Time (Ratio)";
}
std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
......@@ -299,8 +305,18 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.total_time;
if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2)
<< string::Sprintf(
"%f (%f)", event_item.cpu_time,
(event_item.cpu_time / event_item.total_time))
<< std::setw(data_width * 2)
<< string::Sprintf(
"%f (%f)", event_item.gpu_time,
(event_item.gpu_time / event_item.total_time));
}
std::cout << std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.ratio << std::endl;
......@@ -349,6 +365,18 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
return a.ave_time > b.ave_time;
};
break;
case EventSortingKey::kGPUTime:
sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.gpu_time > b.gpu_time;
};
break;
case EventSortingKey::kCPUTime:
sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.cpu_time > b.cpu_time;
};
break;
default:
sorted_domain = "event first end time";
}
......@@ -387,10 +415,17 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
}
if (rit != pushed_events.rend()) {
double event_time = (g_state == ProfilerState::kCUDA ||
g_state == ProfilerState::kAll)
? rit->CudaElapsedMs((*analyze_events)[i][j])
: rit->CpuElapsedMs((*analyze_events)[i][j]);
double event_time = 0;
double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
if (g_state == ProfilerState::kCUDA) {
event_time = gpu_time;
} else if (g_state == ProfilerState::kCPU) {
event_time = cpu_time;
} else {
event_time = gpu_time + cpu_time;
}
total += event_time;
std::string event_name;
......@@ -407,7 +442,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time,
0.};
cpu_time, gpu_time, 0.};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
......@@ -420,6 +455,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
// max time
event_items[index].max_time =
std::max(event_time, event_items[index].max_time);
event_items[index].gpu_time += gpu_time;
event_items[index].cpu_time += cpu_time;
}
// remove the push marker from the list
......
......@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <cuda.h>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace platform {
......@@ -22,26 +21,27 @@ namespace platform {
__global__ void DummyKernel(int *a) { a[0] = 0; }
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
int count = GetCUDADeviceCount();
auto original_device = platform::GetCurrentDeviceId();
int count = platform::GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
platform::SetDeviceId(i);
func(i);
}
SetDeviceId(original_device);
platform::SetDeviceId(original_device);
}
void DummyKernelAndEvent() {
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
CUDADeviceContext *dev_ctx = new CUDADeviceContext(CUDAPlace(d));
platform::SetDeviceId(d);
cudaStream_t stream;
PADDLE_ENFORCE(cudaStreamCreate(&stream));
Mark("_cuda_startup_");
int *ptr;
PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
DummyKernel<<<1, 1, 0, dev_ctx->stream()>>>(ptr);
dev_ctx->Wait();
DummyKernel<<<1, 1, 0, stream>>>(ptr);
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
PADDLE_ENFORCE(cudaFree(ptr));
delete dev_ctx;
});
}
}
......
......@@ -17,54 +17,13 @@ limitations under the License. */
#include <list>
#include <string>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace platform {
enum EventType { kMark, kPushRange, kPopRange };
class Event {
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventType type, std::string name, uint32_t thread_id);
const EventType& type() const;
std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; }
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t event() const { return event_; }
int device() const { return device_; }
#endif
#endif
double CpuElapsedMs(const Event& e) const;
double CudaElapsedMs(const Event& e) const;
private:
EventType type_;
std::string name_;
uint32_t thread_id_;
int64_t cpu_ns_;
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t gpu_ns_ = 0;
public:
void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
gpu_ns_ += end_ns - start_ns;
}
private:
#else
cudaEvent_t event_ = nullptr;
int device_ = -1;
#endif
#include "paddle/fluid/platform/gpu_info.h"
#endif
};
namespace paddle {
namespace platform {
enum ProfilerState {
kDisabled, // disabled state
......@@ -117,7 +76,16 @@ struct RecordBlock {
std::vector<std::vector<Event>> GetAllEvents();
// Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
enum EventSortingKey {
kDefault,
kCalls,
kTotal,
kMin,
kMax,
kAve,
kCPUTime,
kGPUTime
};
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
......
......@@ -33,7 +33,6 @@ TEST(Event, CpuElapsedTime) {
}
TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext;
using paddle::platform::Event;
using paddle::platform::EventType;
using paddle::platform::RecordEvent;
......
......@@ -141,7 +141,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx =
static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);
int numel = memory_size / sizeof(float);
framework::Tensor tensor =
......@@ -156,7 +156,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx =
static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);
int numel = memory_size / sizeof(float);
framework::Tensor tensor =
ctx.AllocateTmpTensor<float, platform::CUDADeviceContext>(
......@@ -179,7 +179,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx =
static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);
int numel = memory_size / sizeof(float);
framework::Tensor out_side_tensor;
......@@ -200,7 +200,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx =
static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr);
size_t memory_size = 500;
int numel = memory_size / sizeof(float);
......
......@@ -34,7 +34,7 @@ void BindTracer(pybind11::module* m) {
framework::BlockDesc* block,
const platform::CPUPlace expected_place,
const bool stop_gradient = false) {
self.Trace(op, inputs, outputs, block, expected_place,
return self.Trace(op, inputs, outputs, block, expected_place,
stop_gradient);
})
.def("trace",
......@@ -44,7 +44,7 @@ void BindTracer(pybind11::module* m) {
framework::BlockDesc* block,
const platform::CUDAPlace expected_place,
const bool stop_gradient = false) {
self.Trace(op, inputs, outputs, block, expected_place,
return self.Trace(op, inputs, outputs, block, expected_place,
stop_gradient);
})
.def("py_trace", &imperative::Tracer::PyTrace,
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/pybind/ir.h"
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
......@@ -101,7 +102,8 @@ void BindGraph(py::module *m) {
[](Graph &self, Node &node) { return self.RemoveNode(&node); })
.def("retrieve_node", &Graph::RetrieveNode,
return_value_policy::reference)
.def("resolve_hazard", &Graph::ResolveHazard);
.def("resolve_hazard", &Graph::ResolveHazard)
.def("origin_program_desc", &Graph::OriginProgram);
}
void BindNode(py::module *m) {
......@@ -115,7 +117,7 @@ void BindNode(py::module *m) {
.def("is_var", &Node::IsVar)
.def("is_ctrl_var", &Node::IsCtrlVar)
.def("clear_inputs", [](Node &self) { self.inputs.clear(); })
.def("inputs_remove",
.def("remove_input",
[](Node &self, int node_id) {
auto pos = std::find_if(
self.inputs.begin(), self.inputs.end(),
......@@ -124,7 +126,7 @@ void BindNode(py::module *m) {
self.inputs.erase(pos);
}
})
.def("inputs_remove",
.def("remove_input",
[](Node &self, Node &node) {
auto pos =
std::find(self.inputs.begin(), self.inputs.end(), &node);
......@@ -132,10 +134,10 @@ void BindNode(py::module *m) {
self.inputs.erase(pos);
}
})
.def("inputs_append",
.def("append_input",
[](Node &self, Node &node) { self.inputs.push_back(&node); })
.def("clear_outputs", [](Node &self) { self.outputs.clear(); })
.def("outputs_remove",
.def("remove_output",
[](Node &self, int node_id) {
auto pos = std::find_if(
self.outputs.begin(), self.outputs.end(),
......@@ -144,7 +146,7 @@ void BindNode(py::module *m) {
self.outputs.erase(pos);
}
})
.def("outputs_remove",
.def("remove_output",
[](Node &self, Node &node) {
auto pos =
std::find(self.outputs.begin(), self.outputs.end(), &node);
......@@ -152,7 +154,7 @@ void BindNode(py::module *m) {
self.outputs.erase(pos);
}
})
.def("outputs_append",
.def("append_output",
[](Node &self, Node &node) { self.outputs.push_back(&node); })
.def_readwrite("inputs", &Node::inputs)
.def_readwrite("outputs", &Node::outputs);
......
......@@ -189,6 +189,8 @@ void BindBlockDesc(pybind11::module *m) {
return self.HasVar(name);
},
pybind11::return_value_policy::reference)
.def("_clear_block", [](pd::BlockDesc &self) { return self.Clear(); },
pybind11::return_value_policy::reference)
.def("_rename_var",
[](pd::BlockDesc &self, const pybind11::bytes &byte_name,
const pybind11::bytes &byte_name_new) {
......
......@@ -976,6 +976,7 @@ All parameter, weight, gradient are variables in Paddle.
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
// -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
ExecutionStrategy allows the user to more preciously control how to run
......@@ -1213,9 +1214,9 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC");
pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const ProgramDesc &,
const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &>())
const std::unordered_set<std::string> &, const std::string &,
Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
const BuildStrategy &, ir::Graph *>())
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
......
......@@ -73,7 +73,7 @@ int main() {
PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
// init all parameters
executor.Run(*startup_program.get(), &scope, 0);
executor.Run(*startup_program, &scope, 0);
// prepare data
auto x_var = scope.Var("x");
......@@ -101,7 +101,7 @@ int main() {
clock_t t1 = clock();
for (int i = 0; i < 10; ++i) {
executor.Run(*train_program.get(), &scope, 0, false, true);
executor.Run(*train_program, &scope, 0, false, true);
std::cout << "step: " << i << " loss: "
<< loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
<< std::endl;
......
......@@ -74,7 +74,7 @@ void Train() {
float first_loss = 0.0;
float last_loss = 0.0;
for (int i = 0; i < 100; ++i) {
executor.Run(*train_program.get(), &scope, 0, false, true);
executor.Run(*train_program, &scope, 0, false, true);
if (i == 0) {
first_loss = loss_var->Get<framework::LoDTensor>().data<float>()[0];
} else if (i == 99) {
......
......@@ -444,6 +444,7 @@ function assert_api_spec_approvals() {
"paddle/fluid/framework/ir/node.h"
"paddle/fluid/framework/ir/graph.h"
"paddle/fluid/framework/framework.proto"
"python/paddle/fluid/compiler.py"
"paddle/fluid/operators/distributed/send_recv.proto.in")
for API_FILE in ${API_FILES[*]}; do
API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true`
......
......@@ -17,8 +17,10 @@ import os
import six
import sys
from .. import compat as cpt
from . import framework
from . import core
from . import framework
__all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
......@@ -36,7 +38,7 @@ def _place_obj(place):
class CompiledProgram(object):
"""
Compiles a Program for execution.
Compiles to Graph for execution.
1. Users first create the program with layers.
2. Optionally, users use CompiledProgram to optimize the program before run.
......@@ -51,7 +53,7 @@ class CompiledProgram(object):
Example:
.. code-block:: python
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
......@@ -62,11 +64,25 @@ class CompiledProgram(object):
fetch_list=[loss.name])
Args:
program: Program instance that contains the model logic.
program_or_graph (Graph|Program): If it's a Program, it will first be
lowered to a graph for further optimizations. If it's a graph
(potentially optimized before), it will be used directly for
further optimizations. Note: a graph is only supported when
compiled with the with_data_parallel option.
"""
def __init__(self, program):
self._program = program
def __init__(self, program_or_graph):
if isinstance(program_or_graph, core.Graph):
self._graph = program_or_graph
self._program = None
elif isinstance(program_or_graph, framework.Program):
self._graph = core.Graph(program_or_graph.desc)
self._program = program_or_graph
else:
raise ValueError("Wrong program_or_graph type: %s" %
type(program_or_graph))
self._program_desc = self._graph.origin_program_desc()
self._scope = None
self._place = None
self._executor = None
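A hedged usage sketch of the two construction paths accepted above; `main_program`, `loss`, `feed_dict` and `use_cuda` are assumed to be defined elsewhere, and the startup program to have been run already:

```
import paddle.fluid as fluid
from paddle.fluid import compiler, core

place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)

# 1) From a Program: it is lowered to a core.Graph internally.
compiled_prog = compiler.CompiledProgram(main_program).with_data_parallel(
    loss_name=loss.name)

# 2) From an already-built graph; per the note above, a graph is only
#    supported together with with_data_parallel.
graph = core.Graph(main_program.desc)
compiled_graph = compiler.CompiledProgram(graph).with_data_parallel(
    loss_name=loss.name)

exe.run(compiled_prog, feed=feed_dict, fetch_list=[loss.name])
```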
......@@ -101,6 +117,7 @@ class CompiledProgram(object):
self
"""
assert not self._is_data_parallel, "Already compiled with parallel."
assert not self._is_inference, "Cannot compile both data parallel and inference"
self._is_data_parallel = True
self._build_strategy = build_strategy
self._exec_strategy = exec_strategy
......@@ -110,6 +127,8 @@ class CompiledProgram(object):
self._exec_strategy = ExecutionStrategy()
if self._build_strategy is None:
self._build_strategy = BuildStrategy()
self._build_strategy.is_distribution = framework.is_pserver_mode(
self._program)
return self
def with_inference_optimize(self, config):
......@@ -120,11 +139,13 @@ class CompiledProgram(object):
Returns:
self
"""
assert not self._is_data_parallel, "Cannot compile both data parallel and inference"
assert not self._is_inference, "Already compiled with inference"
assert any([
isinstance(config, InferNativeConfig),
isinstance(config, InferAnalysisConfig)
])
self._is_data_parallel = False
self._is_inference = True
self._infer_config = config
return self
......@@ -173,37 +194,41 @@ class CompiledProgram(object):
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self._exec_strategy.num_threads = cpu_num * 2
trainers_endpoints = self._program._trainers_endpoints
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if self._build_strategy.memory_optimize is None:
self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True
# TODO(wuyi): trainer endpoings should be passed in through
# build_strategy, not program.xxx.
if self._program and self._build_strategy.num_trainers > 1 and \
self._program._trainers_endpoints:
tps = self._program._trainers_endpoints
if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len(
trainers_endpoints), "num_trainers == len(end_points)"
self._build_strategy.trainers_endpoints = trainers_endpoints
self._persistable_vars = set([
cpt.to_text(v.name)
for v in [
var for var in self._program.list_vars()
if var.persistable and var.type != core.VarDesc.VarType.RAW
]
tps), "num_trainers == len(end_points)"
self._build_strategy.trainers_endpoints = tps
self._persistable_vars = []
for block_id in range(self._program_desc.num_blocks()):
bdesc = self._program_desc.block(block_id)
self._persistable_vars.extend([
cpt.to_text(v.name()) for v in bdesc.all_vars()
if v.persistable() and v.type() != core.VarDesc.VarType.RAW
])
places = list(map(_place_obj, self._places))
return core.ParallelExecutor(
places, self._persistable_vars, self._program.desc,
places,
set(self._persistable_vars),
cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), self._scope, self._local_scopes,
self._exec_strategy, self._build_strategy)
self._exec_strategy, self._build_strategy, self._graph)
def _compile_inference(self):
assert self._is_data_parallel is False
return core.create_paddle_predictor(self._infer_config)
def _compile(self, scope, place):
......
......@@ -17,7 +17,9 @@ import numpy as np
import six
from ..... import compat as cpt
from .... import core
from .... import Executor
from ....framework import IrGraph
from ....framework import IrNode
from ....framework import Program
from ....initializer import Constant
from .... import unique_name
......@@ -31,7 +33,7 @@ __all__ = [
class QuantizationTransformPass(object):
def __init__(self,
scope=None,
program_exe=None,
place=None,
weight_bits=8,
activation_bits=8,
activation_quantize_type='abs_max',
......@@ -45,7 +47,7 @@ class QuantizationTransformPass(object):
scope(fluid.Scope): When activation use 'range_abs_max' as the quantize
type, this pass will create some new parameters. The scope is used to
initialize these new parameters.
program_exe(fluid.Executor): program_exe is used to initialize new
place(fluid.CPUPlace|fluid.CUDAPlace): place is used to initialize new
parameters described above.
weight_bits (int): quantization bit number for weights,
the bias is not quantized.
......@@ -71,13 +73,13 @@ class QuantizationTransformPass(object):
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
exe = fluid.Executor(fluid.CPUPlace())
place = fluid.CPUPlace()
transform_pass = QuantizationTransformPass(fluid.global_scope(),
exe)
place)
transform_pass.apply(graph)
"""
self._scope = scope
self._program_exe = program_exe
self._place = place
self._weight_bits = weight_bits
self._activation_bits = activation_bits
......@@ -118,7 +120,7 @@ class QuantizationTransformPass(object):
self._is_test = graph.is_test()
# marked the variable which has been dequantized.
dequantized_vars = collections.OrderedDict()
persistable_vars = [p.name() for p in graph.all_persistable_vars()]
persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
def _transform_forward(graph, op):
for var_node in op.inputs:
......@@ -149,7 +151,7 @@ class QuantizationTransformPass(object):
if not self._is_test:
self._create_global_step(graph)
ops = graph.all_ops()
ops = graph.all_op_nodes()
# The process of _transform_forward and _transform_backward is needed in two for loops.
# The loop for transforming the forward graph:
for op in ops:
......@@ -163,8 +165,8 @@ class QuantizationTransformPass(object):
if len(self._need_initialized) > 0:
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._program_exe is not None, \
'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
init_program = Program()
for var_desc, initializer in six.iteritems(self._need_initialized):
var = init_program.global_block().create_var(
......@@ -175,7 +177,8 @@ class QuantizationTransformPass(object):
lod_level=var_desc.lod_level(),
persistable=var_desc.persistable())
initializer(var, init_program.global_block())
self._program_exe.run(program=init_program, scope=self._scope)
exe = Executor(self._place)
exe.run(program=init_program, scope=self._scope)
return graph
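Taken together with the passes defined below, the intended flow (mirroring the updated tests later in this change) is: transform the training and test graphs, then freeze, convert weights to int8, and finally rewrite for mobile. A hedged sketch, assuming `scope`, `place`, `main_graph` and `test_graph` already exist and that 'abs_max' is the chosen quantize type:

```
transform_pass = QuantizationTransformPass(
    scope=scope, place=place, activation_quantize_type='abs_max')
transform_pass.apply(main_graph)
transform_pass.apply(test_graph)

freeze_pass = QuantizationFreezePass(scope=scope, place=place)
freeze_pass.apply(test_graph)

convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
convert_int8_pass.apply(test_graph)

TransformForMobilePass().apply(test_graph)
```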
......@@ -183,11 +186,11 @@ class QuantizationTransformPass(object):
if self._weight_quantize_type == 'range_abs_max' or \
self._activation_quantize_type == 'range_abs_max':
counter_name = cpt.to_text('@STEP_COUNTER@')
for node in graph.all_vars():
for node in graph.all_var_nodes():
if node.name() == counter_name:
self._global_step = node
if self._global_step is None:
global_step_in = graph.create_param_node(
global_step_in = graph.create_persistable_node(
name=counter_name,
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1],
......@@ -228,14 +231,14 @@ class QuantizationTransformPass(object):
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(var_node.name()),
var_type=var_node.var().type(),
shape=var_node.var().shape(),
var_dtype=var_node.var().dtype())
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
scale_var_node = graph.create_var_node(
name=self._quantized_scale_name(var_node.name()),
var_type=var_node.var().type(),
shape=var_node.var().shape(),
var_dtype=var_node.var().dtype())
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
quant_op_node = graph.create_op_node(
op_type='fake_quantize_abs_max',
attrs={
......@@ -258,15 +261,15 @@ class QuantizationTransformPass(object):
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(var_node.name()),
var_type=var_node.var().type(),
shape=var_node.var().shape(),
var_dtype=var_node.var().dtype())
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
scale_in_node = graph.create_param_node(
scale_in_node = graph.create_persistable_node(
name=self._quantized_scale_name(var_node.name()),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1],
var_dtype=var_node.var().dtype())
var_dtype=var_node.dtype())
self._need_initialized[scale_in_node.var()] = Constant(value=0.001)
scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
......@@ -275,11 +278,11 @@ class QuantizationTransformPass(object):
if not self._is_test:
# The name of scales_var_node maybe 'scales_0', 'scales_1', etc.
scales_node = graph.create_param_node(
scales_node = graph.create_persistable_node(
name=unique_name.generate('scales'),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[self._window_size],
var_dtype=var_node.var().dtype())
var_dtype=var_node.dtype())
self._need_initialized[scales_node.var()] = Constant(value=0)
inputs['Iter'] = self._global_step
outputs['OutScales'] = scales_node
......@@ -314,9 +317,9 @@ class QuantizationTransformPass(object):
dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(var_node.name()),
var_type=var_node.var().type(),
shape=var_node.var().shape(),
var_dtype=var_node.var().dtype())
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
max_range = (1 << (quant_bits - 1)) - 1
dequant_op_node = graph.create_op_node(
op_type='fake_dequantize_max_abs',
......@@ -400,22 +403,22 @@ class QuantizationFreezePass(object):
Args:
graph(IrGraph): the applied graph.
"""
persistable_vars = [p.name() for p in graph.all_persistable_vars()]
ops = graph.all_ops()
persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
ops = graph.all_op_nodes()
for op_node in ops:
op_name = op_node.name()
if op_name in self._fake_quant_op_names:
input_arg_name = op_node.op().input('X')[0]
input_arg_name = op_node.input('X')[0]
if input_arg_name in persistable_vars:
if self._weight_quantize_type == 'abs_max':
param = self._load_var(input_arg_name)
scale_v = np.max(np.abs(param))
else:
scale_v = self._load_var(op_node.op().output('OutScale')
[0])[0]
scale_v = self._load_var(
op_node.output('OutScale')[0])[0]
self._var_scale_map[input_arg_name] = scale_v
else:
scale_v = graph.var_node(op_node.op().output('OutScale')[0])
scale_v = graph.var_node(op_node.output('OutScale')[0])
self._var_scale_map[input_arg_name] = scale_v
if input_arg_name in persistable_vars:
self._remove_fake_quant_and_dequant_op(graph, op_node)
......@@ -425,13 +428,13 @@ class QuantizationFreezePass(object):
self._weight_bits)
self._restore_var(input_arg_name, quantized_param_v)
ops = graph.all_ops()
ops = graph.all_op_nodes()
for op_node in ops:
op_name = op_node.name()
if op_name in self._fake_dequant_op_names:
self._remove_fake_quant_and_dequant_op(graph, op_node)
ops = graph.all_ops()
ops = graph.all_op_nodes()
for op_node in ops:
op_name = op_node.name()
if op_name in self._quantizable_ops:
......@@ -451,8 +454,8 @@ class QuantizationFreezePass(object):
return graph
def _remove_fake_quant_and_dequant_op(self, graph, op_node):
k = op_node.op().output('Out')[0]
v = op_node.op().input('X')[0]
k = op_node.output('Out')[0]
v = op_node.input('X')[0]
if v not in self._op_input_rename_map:
self._op_input_rename_map[k] = v
else:
......@@ -462,7 +465,7 @@ class QuantizationFreezePass(object):
def _insert_post_dequant_op(self, graph, op_node):
max_range = None
scale_var_node = None
persistable_vars = [p.name() for p in graph.all_persistable_vars()]
persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
for var_node in op_node.inputs:
name = var_node.name()
if name in self._op_input_rename_map:
......@@ -480,7 +483,7 @@ class QuantizationFreezePass(object):
original_var_name)
max_range = param_range * act_range / scale_v
else:
assert isinstance(scale_v, core.Node)
assert isinstance(scale_v, IrNode)
scale_var_node = self._var_scale_map[original_var_name]
if len(op_node.outputs) != 1:
......@@ -490,9 +493,9 @@ class QuantizationFreezePass(object):
output_var_node = op_node.outputs[0]
dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(output_var_node.name()),
var_type=output_var_node.var().type(),
shape=output_var_node.var().shape(),
var_dtype=output_var_node.var().dtype())
var_type=output_var_node.type(),
shape=output_var_node.shape(),
var_dtype=output_var_node.dtype())
dequant_op_node = graph.create_op_node(
op_type='fake_dequantize_max_abs',
attrs={
......@@ -517,14 +520,19 @@ class QuantizationFreezePass(object):
def _remove_unused_var_nodes(self, graph):
all_used_vars = set()
ops = graph.all_ops()
ops = graph.all_op_nodes()
for op_node in ops:
for input_node in op_node.inputs:
all_used_vars.add(input_node)
for output_node in op_node.outputs:
all_used_vars.add(output_node)
all_unused_vars = graph.all_vars() - all_used_vars
all_used_vars = {n.node for n in all_used_vars}
all_unused_vars = {
n
for n in filter(lambda node: node.node not in all_used_vars,
graph.all_var_nodes())
}
graph.safe_remove_nodes(all_unused_vars)
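The filter above compares the underlying node objects (n.node) rather than the wrappers, since the graph accessors appear to wrap the same core.Node in a fresh Python object on every access (see the IrNode properties added in framework.py later in this change); a plain set difference on the wrappers would therefore remove nothing. A tiny, hypothetical illustration:

```
class Wrapper(object):
    def __init__(self, node):
        self.node = node

raw = object()
a, b = Wrapper(raw), Wrapper(raw)
assert a is not b                          # two distinct wrappers...
assert {w.node for w in (a, b)} == {raw}   # ...around one underlying node
```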
def _original_var_name(self, var_name):
......@@ -583,8 +591,8 @@ class ConvertToInt8Pass(object):
Args:
graph(IrGraph): the applied graph.
"""
persistable_vars = [p.name() for p in graph.all_persistable_vars()]
ops = graph.all_ops()
persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
ops = graph.all_op_nodes()
input_map = {}
for op_node in ops:
op_name = op_node.name()
......@@ -605,10 +613,10 @@ class ConvertToInt8Pass(object):
def _convert_to_int8(self, graph, var_node):
int8_var_node_name = var_node.name() + ".int8"
int8_var_node = graph.create_param_node(
int8_var_node = graph.create_persistable_node(
name=cpt.to_text(int8_var_node_name),
var_type=var_node.var().type(),
shape=var_node.var().shape(),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=core.VarDesc.VarType.INT8)
array = self._load_var(var_node.name())
self._scope.var(int8_var_node_name)
......@@ -624,14 +632,19 @@ class ConvertToInt8Pass(object):
def _remove_unused_var_nodes(self, graph):
all_used_vars = set()
ops = graph.all_ops()
ops = graph.all_op_nodes()
for op_node in ops:
for input_node in op_node.inputs:
all_used_vars.add(input_node)
for output_node in op_node.outputs:
all_used_vars.add(output_node)
all_unused_vars = graph.all_vars() - all_used_vars
all_used_vars = {n.node for n in all_used_vars}
all_unused_vars = {
n
for n in filter(lambda node: node.node not in all_used_vars,
graph.all_var_nodes())
}
graph.safe_remove_nodes(all_unused_vars)
......@@ -655,11 +668,11 @@ class TransformForMobilePass(object):
Args:
graph(IrGraph): the graph will be transformed.
"""
ops = graph.all_ops()
ops = graph.all_op_nodes()
for op_node in ops:
name = op_node.name()
if name in self._fake_quant_op_names:
op_node.op().set_type('quantize')
op_node.set_type('quantize')
quant_node = graph.create_op_node_from_desc(op_node.op())
for input_node in op_node.inputs:
graph.link_to(input_node, quant_node)
......@@ -667,7 +680,7 @@ class TransformForMobilePass(object):
graph.link_to(quant_node, output_node)
graph.safe_remove_nodes(op_node)
if name in self._fake_dequant_op_names:
op_node.op().set_type('dequantize')
op_node.set_type('dequantize')
dequant_node = graph.create_op_node_from_desc(op_node.op())
for input_node in op_node.inputs:
graph.link_to(input_node, dequant_node)
......
......@@ -61,16 +61,16 @@ class TestGraph(unittest.TestCase):
opt.minimize(loss)
graph = IrGraph(core.Graph(main.desc), for_test=False)
marked_nodes = set()
for op in graph.all_ops():
for op in graph.all_op_nodes():
if op.name().find('conv2d') > -1:
marked_nodes.add(op)
graph.draw('.', 'residual', marked_nodes)
self.assertFalse(graph.has_circle())
self.assertEqual(graph.graph_num(), 1)
nodes = graph.topology_sort()
self.assertEqual(len(nodes), len(graph.all_ops()))
self.assertEqual(len(nodes), len(graph.all_op_nodes()))
nodes_map = graph.build_adjacency_list()
self.assertEqual(len(nodes_map), len(graph.all_ops()))
self.assertEqual(len(nodes_map), len(graph.all_op_nodes()))
nodes_num = len(graph.all_nodes())
graph.safe_remove_nodes(marked_nodes)
self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
......
......@@ -130,15 +130,16 @@ class TestQuantizationTransformPass(unittest.TestCase):
loss = linear_fc(3)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
exe = fluid.Executor(fluid.CPUPlace())
place = fluid.CPUPlace()
exe = fluid.Executor(place)
graph = IrGraph(core.Graph(main.desc), for_test=False)
transform_pass = QuantizationTransformPass(
scope=fluid.global_scope(),
program_exe=exe,
place=place,
activation_quantize_type=quant_type)
transform_pass.apply(graph)
marked_nodes = set()
for op in graph.all_ops():
for op in graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
......@@ -146,7 +147,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
self.check_program(transform_pass, program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False)
val_marked_nodes = set()
for op in val_graph.all_ops():
for op in val_graph.all_op_nodes():
if op.name().find('quantize') > -1:
val_marked_nodes.add(op)
val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
......@@ -166,15 +167,16 @@ class TestQuantizationTransformPass(unittest.TestCase):
loss = residual_block(2)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
exe = fluid.Executor(fluid.CPUPlace())
place = fluid.CPUPlace()
exe = fluid.Executor(place)
graph = IrGraph(core.Graph(main.desc), for_test=False)
transform_pass = QuantizationTransformPass(
scope=fluid.global_scope(),
program_exe=exe,
place=place,
activation_quantize_type=quant_type)
transform_pass.apply(graph)
marked_nodes = set()
for op in graph.all_ops():
for op in graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
......@@ -182,7 +184,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
self.check_program(transform_pass, program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False)
val_marked_nodes = set()
for op in val_graph.all_ops():
for op in val_graph.all_op_nodes():
if op.name().find('quantize') > -1:
val_marked_nodes.add(op)
val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
......@@ -231,17 +233,17 @@ class TestQuantizationFreezePass(unittest.TestCase):
with fluid.scope_guard(scope):
exe.run(startup)
transform_pass = QuantizationTransformPass(
scope=scope, program_exe=exe, activation_quantize_type=quant_type)
scope=scope, place=place, activation_quantize_type=quant_type)
transform_pass.apply(main_graph)
transform_pass.apply(test_graph)
dev_name = '_gpu_' if use_cuda else '_cpu_'
marked_nodes = set()
for op in main_graph.all_ops():
for op in main_graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
marked_nodes = set()
for op in test_graph.all_ops():
for op in test_graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
......@@ -251,11 +253,6 @@ class TestQuantizationFreezePass(unittest.TestCase):
iters = 5
batch_size = 8
#train_exe = fluid.ParallelExecutor(
# main_program=quantized_main_program,
# use_cuda=bool(use_cuda),
# loss_name=loss.name,
# scope=scope)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
......@@ -269,9 +266,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
loss_v = exe.run(program=quantized_main_program,
feed=feeder.feed(data),
fetch_list=[loss])
#loss_v = train_exe.run(feed=feeder.feed(data),
# fetch_list=[loss.name])
#print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))
test_data = next(test_reader())
with fluid.program_guard(quantized_test_program):
......@@ -287,7 +282,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
freeze_pass = QuantizationFreezePass(scope=scope, place=place)
freeze_pass.apply(test_graph)
marked_nodes = set()
for op in test_graph.all_ops():
for op in test_graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
......@@ -299,21 +294,21 @@ class TestQuantizationFreezePass(unittest.TestCase):
feed=feeder.feed(test_data),
fetch_list=[loss])
self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
#print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
#print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
# Maybe failed, this is due to the calculation precision
# self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
#print('{}: {}'.format('w_freeze' + dev_name + quant_type,
# np.sum(w_freeze)))
#print('{}: {}'.format('w_quant' + dev_name + quant_type,
# np.sum(w_quant)))
print('{}: {}'.format('w_freeze' + dev_name + quant_type,
np.sum(w_freeze)))
print('{}: {}'.format('w_quant' + dev_name + quant_type,
np.sum(w_quant)))
# Convert parameter to 8-bit.
convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
convert_int8_pass.apply(test_graph)
marked_nodes = set()
for op in test_graph.all_ops():
for op in test_graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
test_graph.draw('.', 'test_int8' + dev_name + quant_type, marked_nodes)
......@@ -330,14 +325,14 @@ class TestQuantizationFreezePass(unittest.TestCase):
w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
self.assertEqual(w_8bit.dtype, np.int8)
self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
#print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
#print('{}: {}'.format('w_freeze' + dev_name + quant_type,
# np.sum(w_freeze)))
print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
print('{}: {}'.format('w_freeze' + dev_name + quant_type,
np.sum(w_freeze)))
mobile_pass = TransformForMobilePass()
mobile_pass.apply(test_graph)
marked_nodes = set()
for op in test_graph.all_ops():
for op in test_graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
......
......@@ -538,6 +538,8 @@ class Executor(object):
else:
# TODO(panyx0718): Can compile program to optimize executor
# performance.
# TODO(panyx0718): executor should be able to run graph.
assert program._program, "A CompiledProgram compiled from a graph can only be run with with_data_parallel."
return self._run(
program._program,
self._default_executor,
......
......@@ -87,6 +87,15 @@ def _current_expected_place():
return _imperative_current_expected_place_
def is_pserver_mode(main_program):
main = main_program if main_program \
else default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
class NameScope(object):
def __init__(self, name="", parent=None):
self._children = dict()
......@@ -378,16 +387,19 @@ class Variable(object):
# get_capacity is implemented
pass
self.block.vars[name] = self
self.op = None
self.stop_gradient = stop_gradient
self.is_data = is_data
if _in_imperative_mode():
# record vars in tracer rather than blocks
self._ivar = kwargs.get("ivar", None)
if not self._ivar:
self._ivar = core.VarBase()
self._ivar = core.VarBase(stop_gradient)
self._ivar.desc = self.desc
self._ivar.stop_gradient = stop_gradient
if persistable:
self.block.vars[name] = self
else:
self.block.vars[name] = self
self.op = None
self.stop_gradient = stop_gradient
self.is_data = is_data
def _numpy(self):
new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
......@@ -723,7 +735,6 @@ class Operator(object):
self._update_desc_attr(attr_name, attr_val)
self.desc.check_attrs()
if self._has_kernel(type):
self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc)
......@@ -731,6 +742,7 @@ class Operator(object):
if _in_imperative_mode():
self.iop = core.OpBase()
self.iop.desc = self.desc
self.inputs = defaultdict(list)
if inputs is not None:
for k, v in six.iteritems(inputs):
......@@ -738,6 +750,7 @@ class Operator(object):
self.inputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.inputs[k].extend([var._ivar for var in v])
self.outputs = defaultdict(list)
if outputs is not None:
for k, v in six.iteritems(outputs):
......@@ -1187,6 +1200,15 @@ class Block(object):
else:
raise ValueError("Var {0} is not found recursively".format(name))
def _clear_block(self):
# TODO(minqiyang): move this to backward_hooks
self.desc._clear_block()
for name in self.vars.keys():
assert self.vars[name].persistable
del self.ops[:]
def all_parameters(self):
return list(self.iter_parameters())
......@@ -1317,18 +1339,31 @@ class Block(object):
inputs=kwargs.get("inputs", None),
outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None))
self.ops.append(op)
# TODO(minqiyang): add stop_gradient support in static mode too.
if _in_imperative_mode():
# record ops in tracer rather than blocks
#
# TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in imperative mode.
self._trace_op(op, kwargs.get("stop_gradient", False))
self.ops.append(op)
return op
def _trace_op(self, op, stop_gradient=False):
if _in_imperative_mode():
_imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc,
_imperative_current_expected_place_,
stop_gradient)
backward_refs = _imperative_tracer().trace(
op.iop, op.inputs, op.outputs, self.desc,
_imperative_current_expected_place_, stop_gradient)
# TODO(minqiyang): support backward_hooks to eager remove backward_refs
op.backward_refs = defaultdict(list)
for k, v in six.iteritems(op.inputs):
if k in backward_refs:
op.backward_refs[k] = op.inputs[k]
for k, v in six.iteritems(op.outputs):
if k in backward_refs:
op.backward_refs[k] = op.outputs[k]
def _insert_op(self, index, *args, **kwargs):
"""
......@@ -1383,6 +1418,7 @@ class Block(object):
outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None))
self.ops.insert(0, op)
if _in_imperative_mode():
self._trace_op(op, kwargs.get("stop_gradient", False))
return op
......@@ -1530,10 +1566,397 @@ class Block(object):
return ret_var
class IrNode(object):
"""
Python IrNode. Beneath it is a core.Node, which is used for Ir Pass.
"""
def __init__(self, node):
"""
Construct an IrNode using core.Node.
Args:
node(core.Node): C++ Node.
"""
assert isinstance(node,
core.Node), 'node must be the instance of core.Node.'
self.node = node
def name(self):
"""
Return the node name.
Returns:
str: node name.
"""
return self.node.name()
def node_type(self):
"""
Return the node type.
Returns:
core.Node.Type: node type(core.Node.Type.Operation or core.Node.Type.Variable).
"""
return self.node.node_type()
def var(self):
"""
Return the node variable description.
Returns:
core.VarDesc: node variable description.
"""
return self.node.var()
def op(self):
"""
Return the node operator description.
Returns:
core.OpDesc: node operator description.
"""
return self.node.op()
def id(self):
"""
Return the node id.
Returns:
int: node id.
"""
return self.node.id()
def is_op(self):
"""
If the node is an operator, then return true.
Returns:
bool: indicate whether the node is an operator.
"""
return self.node.is_op()
def is_var(self):
"""
If the node is a variable, then return true.
Returns:
bool: indicate whether the node is a variable.
"""
return self.node.is_var()
def is_ctrl_var(self):
"""
If the node is a control dependence variable, then return true.
Returns:
bool: indicate whether the node is a control dependence variable.
"""
return self.node.is_ctrl_var()
def clear_inputs(self):
"""
Clear the node inputs. After executing the `clear_inputs` function,
the node inputs will be empty.
"""
self.node.clear_inputs()
def remove_input_by_id(self, node_id):
"""
Remove a node from inputs by the given node id.
Args:
node_id(int): the given node id.
"""
self.node.remove_input(node_id)
def remove_input(self, node):
"""
Remove a node from inputs.
Args:
node(IrNode): the node being removed.
"""
self.node.remove_input(node.node)
def append_input(self, node):
"""
Append a node in inputs.
Args:
node(IrNode): the node being appended.
"""
self.node.append_input(node.node)
def clear_outputs(self):
"""
Clear the node outputs. After executing the `clear_outputs` function,
the node outputs will be empty.
"""
self.node.clear_outputs()
def remove_output_by_id(self, node_id):
"""
Remove a node from outputs by the given node id.
Args:
node_id(int): the given node id.
"""
self.node.remove_output(node_id)
def remove_output(self, node):
"""
Remove a node from outputs.
Args:
node(IrNode): the node being removed.
"""
self.node.remove_output(node.node)
def append_output(self, node):
"""
Append a node in outputs.
Args:
node(IrNode): the node being appended.
"""
self.node.append_output(node.node)
@property
def inputs(self):
"""
Return the node inputs.
Returns:
list(IrNode): node inputs wrapped by IrNode.
"""
return [IrNode(n) for n in self.node.inputs]
@property
def outputs(self):
"""
Return the node outputs.
Returns:
list(IrNode): node outputs wrapped by IrNode.
"""
return [IrNode(n) for n in self.node.outputs]
class IrVarNode(IrNode):
"""
Python IrVarNode. Beneath it is a core.Node, and it inherits from IrNode.
"""
def __init__(self, node):
"""
Construct an IrVarNode using core.Node.
Args:
node(core.Node): C++ Node.
"""
assert isinstance(node, core.Node) and node.is_var(), \
'node must be an instance of core.Node and it must be a variable node.'
super(IrVarNode, self).__init__(node)
self.node = node
def set_shape(self, shape):
"""
Set the node variable shape.
Args:
shape(list): shape to be set.
"""
assert self.node.var() is not None, \
"The node variable description cannot be None."
self.node.var().set_shape(shape)
def persistable(self):
"""
If the variable node is a persistable variable, then return true.
Returns:
bool: indicate whether the variable is persistable.
"""
assert self.node.var() is not None, \
"The node variable description cannot be None."
return self.node.var().persistable()
def type(self):
"""
Return the variable type.
Returns:
core.VarDesc.VarType: the variable type.
"""
assert self.node.var() is not None, \
"The node variable description cannot be None."
return self.node.var().type()
def dtype(self):
"""
Return the variable data type.
Returns:
core.VarDesc.VarType: the variable data type.
"""
assert self.node.var() is not None, \
"The node variable description cannot be None."
return self.node.var().dtype()
def shape(self):
"""
Return the variable shape.
Returns:
list: the variable shape.
"""
assert self.node.var() is not None, \
"The node variable description cannot be None."
return self.node.var().shape()
@property
def inputs(self):
"""
Return the node inputs.
Returns:
list(IrOpNode): node inputs wrapped by IrOpNode.
"""
return [IrOpNode(n) for n in self.node.inputs]
@property
def outputs(self):
"""
Return the node outputs.
Returns:
list(IrOpNode): node outputs wrapped by IrOpNode.
"""
return [IrOpNode(n) for n in self.node.outputs]
class IrOpNode(IrNode):
"""
Python IrOpNode. Beneath it is a core.Node, and it inherits from IrNode.
"""
def __init__(self, node):
"""
Construct an IrOpNode using core.Node.
Args:
node(core.Node): C++ Node.
"""
assert isinstance(node, core.Node) and node.is_op(), \
'node must be an instance of core.Node and it must be an operator node.'
super(IrOpNode, self).__init__(node)
self.node = node
def rename_input(self, old_input_name, new_input_name):
"""
Rename the input of this node.
Args:
old_input_name(str): the old input name.
new_input_name(str): the new input name.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
self.node.op()._rename_input(old_input_name, new_input_name)
def input(self, name):
"""
Get the argument name list by the parameter name for input.
Args:
name(str): the parameter name.
Returns:
list(str): the argument name list.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
return self.node.op().input(name)
def output(self, name):
"""
Get the argument name list by the parameter name for output.
Args:
name(str): the parameter name.
Returns:
list(str): the argument name list.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
return self.node.op().output(name)
def set_type(self, new_type):
"""
Change the operator type into new type.
Args:
new_type(str): new operator type to be set.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
return self.node.op().set_type(new_type)
def set_attr(self, name, val):
"""
Set the value of attribute by attribute's name.
Args:
name(str): the attribute name.
val(bool|int|str|float|list): the value of the attribute.
"""
self._update_desc_attr(name, val)
def _update_desc_attr(self, name, val):
"""
Update the value of the op desc's attribute by attribute's name.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
desc = self.node.op()
if isinstance(val, Block):
desc.set_block_attr(name, val.desc)
elif isinstance(val, list) and val and \
all(isinstance(v, Block) for v in val):
desc.set_blocks_attr(name, [v.desc for v in val])
elif isinstance(val, core.BlockDesc) or \
isinstance(val, core.ProgramDesc):
desc.set_serialized_attr(name, val.serialize_to_string())
else:
desc._set_attr(name, val)
@property
def inputs(self):
"""
Return the node inputs.
Returns:
list(IrVarNode): node inputs wrapped by IrVarNode.
"""
return [IrVarNode(n) for n in self.node.inputs]
@property
def outputs(self):
"""
Return the node outputs.
Returns:
list(IrVarNode): node outputs wrapped by IrVarNode.
"""
return [IrVarNode(n) for n in self.node.outputs]
class IrGraph(object):
"""
Python IrGraph. Beneath it is a core.Graph, which is used for
create a c++ Ir Pass Graph. An IrGraph is just a graph view of
creating a c++ Ir Pass Graph. An IrGraph is just a graph view of
a Program. In an IrGraph, both Variables and Operators are graph
nodes.
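Example (a minimal sketch; it assumes ``program`` is an existing Program
and that the graph is built from its desc):
.. code-block:: python
    graph = IrGraph(core.Graph(program.desc), for_test=False)
    for op_node in graph.all_op_nodes():
        print(op_node.name())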
"""
......@@ -1561,15 +1984,15 @@ class IrGraph(object):
"""
Return all nodes included in the graph as a set.
"""
return {node for node in self.graph.nodes()}
return {IrNode(node) for node in self.graph.nodes()}
def all_vars(self):
def all_var_nodes(self):
"""
Return all variable nodes included in the graph as a set.
"""
return {node for node in self.graph.nodes() if node.is_var()}
return {IrVarNode(node) for node in self.graph.nodes() if node.is_var()}
def all_persistable_vars(self):
def all_persistable_nodes(self):
"""
Return all persistable variable nodes included in the graph as a set.
"""
......@@ -1578,13 +2001,13 @@ class IrGraph(object):
if node.is_var() and node.var() is not None and node.var(
).persistable():
persistable_nodes.add(node)
return persistable_nodes
return {IrVarNode(p) for p in persistable_nodes}
def all_ops(self):
def all_op_nodes(self):
"""
Return all operator nodes included in the graph as a set.
"""
return {node for node in self.graph.nodes() if node.is_op()}
return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()}
def var_node(self, name):
"""
......@@ -1598,14 +2021,14 @@ class IrGraph(object):
doesn't have a variable with the given name.
Returns:
core.Node: the variable node with the given name.
IrVarNode: the variable node with the given name.
"""
if not isinstance(name, six.string_types):
raise TypeError(
"var require string as parameter, but get %s instead." %
(type(name)))
target_var_node = None
var_nodes = self.all_vars()
var_nodes = self.all_var_nodes()
for var_node in var_nodes:
if var_node.name() == name:
target_var_node = var_node
......@@ -1613,7 +2036,7 @@ class IrGraph(object):
raise ValueError("var_node %s not in this graph" % name)
return target_var_node
def create_param_node(self, name, var_type, shape, var_dtype):
def create_persistable_node(self, name, var_type, shape, var_dtype):
"""
Create a persistable variable node in the graph. In IrGraph,
persistable variables and parameters are not distinguished.
......@@ -1625,14 +2048,14 @@ class IrGraph(object):
var_dtype(core.VarDesc.VarType): the data type of the persistable variable node.
Returns:
core.Node: the created persistable variable node.
IrVarNode: the created persistable variable node.
"""
var_desc = core.VarDesc(name)
var_desc.set_type(var_type)
var_desc.set_shape(shape)
var_desc.set_dtype(var_dtype)
var_desc.set_persistable(True)
return self.graph.create_var_node(var_desc)
return IrVarNode(self.graph.create_var_node(var_desc))
def create_var_node(self, name, var_type, shape, var_dtype):
"""
......@@ -1646,14 +2069,14 @@ class IrGraph(object):
var_dtype(core.VarDesc.VarType): the data type of the variable node.
Returns:
core.Node: the created variable node.
IrVarNode: the created variable node.
"""
var_desc = core.VarDesc(name)
var_desc.set_type(var_type)
var_desc.set_shape(shape)
var_desc.set_dtype(var_dtype)
return self.graph.create_var_node(var_desc)
return IrVarNode(self.graph.create_var_node(var_desc))
def create_var_node_from_desc(self, var_desc):
"""
......@@ -1664,9 +2087,9 @@ class IrGraph(object):
var_desc(core.VarDesc): the given variable description.
Returns:
core.Node: the created variable node.
IrVarNode: the created variable node.
"""
return self.graph.create_var_node(var_desc)
return IrVarNode(self.graph.create_var_node(var_desc))
def create_op_node(self, op_type, attrs, inputs, outputs):
"""
......@@ -1679,7 +2102,7 @@ class IrGraph(object):
outputs(dict): the outputs of the operator node.
Returns:
core.Node: the created operator node.
IrOpNode: the created operator node.
"""
op_desc = core.OpDesc()
op_desc.set_type(op_type)
......@@ -1695,7 +2118,7 @@ class IrGraph(object):
var_nodes = [var_nodes]
op_desc.set_output(output_name,
[var_node.name() for var_node in var_nodes])
return self.graph.create_op_node(op_desc)
return IrOpNode(self.graph.create_op_node(op_desc))
def create_op_node_from_desc(self, op_desc):
"""
......@@ -1705,40 +2128,40 @@ class IrGraph(object):
op_desc(core.OpDesc): the given operator description.
Returns:
core.Node: the created operator node.
IrOpNode: the created operator node.
"""
return self.graph.create_op_node(op_desc)
return IrOpNode(self.graph.create_op_node(op_desc))
def update_input_link(self, old_input_node, new_input_node, op_node):
"""
Update the input link of an operator node.
Args:
old_input_node(core.Node): the old input node of the given op_node.
new_input_node(core.Node): the new input node of the given op_node.
op_node(core.Node): the operator node whose input link needs to be updated.
old_input_node(IrNode): the old input node of the given op_node.
new_input_node(IrNode): the new input node of the given op_node.
op_node(IrOpNode): the operator node whose input link needs to be updated.
"""
assert old_input_node in self.graph.nodes() and new_input_node in \
self.graph.nodes() and op_node in self.graph.nodes(), \
assert old_input_node.node in self.graph.nodes() and new_input_node.node in \
self.graph.nodes() and op_node.node in self.graph.nodes(), \
'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
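# Rewire the graph edges: detach the old input from the op, attach the new
# input, and keep the op desc's input argument name in sync.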
old_input_node.outputs_remove(op_node)
op_node.inputs_remove(old_input_node)
new_input_node.outputs_append(op_node)
op_node.inputs_append(new_input_node)
op_node.op()._rename_input(old_input_node.name(), new_input_node.name())
old_input_node.remove_output(op_node)
op_node.remove_input(old_input_node)
new_input_node.append_output(op_node)
op_node.append_input(new_input_node)
op_node.rename_input(old_input_node.name(), new_input_node.name())
def link_to(self, node_in, node_out):
"""
Connect two nodes.
Args:
node_in(core.Node): the input node.
node_out(core.Node): the output node.
node_in(IrNode): the input node.
node_out(IrNode): the output node.
"""
assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
assert node_in.node in self.graph.nodes() and node_out.node in self.graph.nodes(), \
'The two arguments(node_in&node_out) must be in the graph nodes.'
node_in.outputs_append(node_out)
node_out.inputs_append(node_in)
node_in.append_output(node_out)
node_out.append_input(node_in)
def safe_remove_nodes(self, remove_nodes):
"""
......@@ -1753,7 +2176,8 @@ class IrGraph(object):
remove_nodes = set(remove_nodes)
else:
remove_nodes = {remove_nodes}
core.graph_safe_remove_nodes(self.graph, remove_nodes)
original_nodes = {n.node for n in remove_nodes}
core.graph_safe_remove_nodes(self.graph, original_nodes)
def has_circle(self):
"""
......@@ -1780,18 +2204,23 @@ class IrGraph(object):
Notes: the `graph` cannot contain a circle.
Returns:
set(core.Node): nodes in topology order.
set(IrNode): nodes in topology order.
"""
return core.topology_sort(self.graph)
ordered_nodes = core.topology_sort(self.graph)
return {IrNode(n) for n in ordered_nodes}
def build_adjacency_list(self):
"""
Build an adjacency list of operations for the `graph`.
Returns:
dict{core.Node: set(core.Node)}: the adjacency list.
dict{IrNode: set(IrNode)}: the adjacency list.
"""
return core.build_adjacency_list(self.graph)
adj_list = core.build_adjacency_list(self.graph)
wrapped_adj_list = dict()
for k, v in six.iteritems(adj_list):
wrapped_adj_list[IrNode(k)] = {IrNode(n) for n in v}
return wrapped_adj_list
def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True):
"""
......@@ -1801,7 +2230,7 @@ class IrGraph(object):
Args:
save_path(str): the save path of drawn graph.
name(str): the name of drawn graph.
marked_nodes(set(core.Node)): nodes that are needed to be marked.
marked_nodes(set(IrNode)): nodes that are needed to be marked.
Default value is None.
remove_ctr_var(bool): If it is set True, all control variable nodes
in the graph will be removed. Default value is True.
......@@ -1816,20 +2245,22 @@ class IrGraph(object):
print('The {} is saved as the dot filetype.'.format(
dot_file_path))
if remove_ctr_var:
remove_ctr_vars = set()
for node in self.graph.nodes():
if remove_ctr_var:
for node in self.all_var_nodes():
if node.is_ctrl_var():
remove_ctr_vars.add(node)
self.safe_remove_nodes(remove_ctr_vars)
ops_num = 0
for node in self.graph.nodes():
if node.is_op():
ops_num += 1
print('Total ops num = {}.'.format(ops_num))
print('Total ops num = {}.'.format(len(self.all_op_nodes())))
if marked_nodes is not None:
if not isinstance(marked_nodes, set):
if isinstance(marked_nodes, Iterable):
marked_nodes = set(marked_nodes)
else:
marked_nodes = {marked_nodes}
marked_nodes = {n.node for n in marked_nodes}
remove_ctr_vars = {n.node for n in remove_ctr_vars}
marked_nodes = marked_nodes - remove_ctr_vars
if self.graph.has('__graphviz__marked_node__'):
self.graph.erase('__graphviz__marked_node__')
......
......@@ -17,7 +17,7 @@ import contextlib
import sys
import numpy as np
import collections
from .. import unique_name
from paddle.fluid import core
from paddle.fluid import framework
from paddle.fluid.imperative import base
......@@ -26,14 +26,33 @@ __all__ = ['Layer', 'PyLayer']
class Layer(core.Layer):
"""Layers composed of operators."""
"""Layers composed of operators.
Args:
name_scope: prefix used by the layer to name its parameters.
If the prefix is "my_model/layer_1", a parameter name in MyLayer
can be "my_model/layer_1/MyLayer/w_n", where w is the parameter
base name and n is a unique auto-generated suffix.
dtype: data type for the variables in the layer.
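Example (a minimal sketch; ``MyLayer`` below is a hypothetical subclass
shown only to illustrate the naming scheme):
.. code-block:: python
    import paddle.fluid as fluid
    class MyLayer(fluid.imperative.Layer):
        def __init__(self, name_scope):
            super(MyLayer, self).__init__(name_scope)
        def forward(self, inputs):
            return fluid.layers.relu(inputs)
    with fluid.imperative.guard():
        l = MyLayer("my_model/layer_1")
        # parameters created by this layer get names prefixed with
        # "my_model/layer_1/MyLayer/..."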
"""
def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
def __init__(self, name_scope, dtype=core.VarDesc.VarType.FP32):
self._full_name = unique_name.generate(name_scope + "/" +
self.__class__.__name__)
self._built = False
self._dtype = dtype
self._parameters = collections.OrderedDict()
self._sub_layers = collections.OrderedDict()
def full_name(self):
"""Full name for this layers.
Full name is composed by name_scope + "/" + MyLayer.__class__.__name__
Returns full name of this name.
"""
return self._full_name
def parameters(self, include_sublayers=True):
"""Returns a list of Parameters from current and sub-layers.
......
......@@ -27,6 +27,7 @@ __all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
class Conv2D(layers.Layer):
def __init__(self,
name_scope,
num_channels,
num_filters,
filter_size,
......@@ -38,19 +39,17 @@ class Conv2D(layers.Layer):
act=None,
param_attr=None,
bias_attr=None,
name=None,
dtype=core.VarDesc.VarType.FP32):
assert param_attr is not False, "param_attr should not be False here."
super(Conv2D, self).__init__(name=name, dtype=dtype)
super(Conv2D, self).__init__(name_scope, dtype=dtype)
# TODO(minqiyang): Move this to the top.
from ..layer_helper import LayerHelper
self._helper = LayerHelper(
type(self).__name__,
self.full_name(),
param_attr=param_attr,
bias_attr=bias_attr,
dtype=dtype,
name=name,
act=act)
self._groups = groups
......@@ -143,6 +142,7 @@ class Conv2D(layers.Layer):
class Pool2D(layers.Layer):
def __init__(self,
name_scope,
pool_size=-1,
pool_type="max",
pool_stride=1,
......@@ -151,7 +151,6 @@ class Pool2D(layers.Layer):
use_cudnn=True,
ceil_mode=False,
exclusive=True,
name=None,
dtype=core.VarDesc.VarType.FP32):
if pool_type not in ["max", "avg"]:
raise ValueError(
......@@ -166,10 +165,10 @@ class Pool2D(layers.Layer):
if not isinstance(use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
super(Pool2D, self).__init__(name=name, dtype=dtype)
super(Pool2D, self).__init__(name_scope, dtype=dtype)
from ..layer_helper import LayerHelper
self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name)
self._helper = LayerHelper(self.full_name(), dtype=dtype)
self._pool_type = pool_type
self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
......@@ -205,25 +204,24 @@ class Pool2D(layers.Layer):
class FC(layers.Layer):
def __init__(self,
name_scope,
size,
param_attr=None,
bias_attr=None,
num_flatten_dims=1,
dtype=core.VarDesc.VarType.FP32,
act=None,
name=None):
super(FC, self).__init__()
act=None):
super(FC, self).__init__(name_scope)
self._size = size
self._num_flatten_dims = num_flatten_dims
self._dtype = dtype
from ..layer_helper import LayerHelper
self._helper = LayerHelper(
'FC',
self.full_name(),
param_attr=param_attr,
bias_attr=bias_attr,
act=act,
name=name)
act=act)
def _build_once(self, input):
input_shape = input.shape
......@@ -282,6 +280,7 @@ class FC(layers.Layer):
class BatchNorm(layers.Layer):
def __init__(self,
name_scope,
num_channels,
act=None,
is_test=False,
......@@ -292,22 +291,20 @@ class BatchNorm(layers.Layer):
dtype=core.VarDesc.VarType.FP32,
data_layout='NCHW',
in_place=False,
name=None,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=False,
fuse_with_relu=False,
use_global_stats=False):
super(BatchNorm, self).__init__()
super(BatchNorm, self).__init__(name_scope)
assert bias_attr is not False, "bias_attr should not be False in batch_norm."
from ..layer_helper import LayerHelper
self._helper = LayerHelper(
'batch_norm',
self.full_name(),
param_attr=param_attr,
bias_attr=bias_attr,
name=name,
act=act)
if dtype == core.VarDesc.VarType.FP16:
......@@ -419,6 +416,7 @@ class Embedding(layers.Layer):
constructor.
Args:
name_scope: See base class.
size(tuple|list): The shape of the look up table parameter. It should
have two elements which indicate the size of the dictionary of
embeddings and the size of each embedding vector respectively.
......@@ -446,6 +444,7 @@ class Embedding(layers.Layer):
"""
def __init__(self,
name_scope,
size,
is_sparse=False,
is_distributed=False,
......@@ -453,7 +452,7 @@ class Embedding(layers.Layer):
param_attr=None,
dtype='float32'):
super(Embedding, self).__init__()
super(Embedding, self).__init__(name_scope)
self._size = size
self._is_sparse = is_sparse
self._is_distributed = is_distributed
......@@ -468,7 +467,7 @@ class Embedding(layers.Layer):
assert self._is_sparse is True and self._is_distributed is False
from ..layer_helper import LayerHelper
self._helper = LayerHelper('embedding', param_attr=param_attr)
self._helper = LayerHelper(self.full_name(), param_attr=param_attr)
self._w = self._helper.create_parameter(
attr=self._param_attr,
shape=self._size,
......
......@@ -34,6 +34,9 @@ class LayerHelper(object):
self.kwargs = kwargs
self.layer_type = layer_type
name = self.kwargs.get('name', None)
# TODO(panyx0718, minqiyang): imperative mode
# can not use both `layer_type` and `name`. Deprecate LayerHelper
# and write a Helper for imperative mode.
if name is None:
self.kwargs['name'] = unique_name.generate(self.layer_type)
......
......@@ -551,9 +551,10 @@ def yolov3_loss(x,
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchors = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors,
ignore_thresh=0.5, downsample_ratio=32)
anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32)
"""
helper = LayerHelper('yolov3_loss', **locals())
......
......@@ -87,6 +87,7 @@ __all__ = [
'transpose',
'im2sequence',
'nce',
'sampled_softmax_with_cross_entropy',
'hsigmoid',
'beam_search',
'row_conv',
......@@ -668,7 +669,11 @@ def dynamic_lstmp(input,
candidate_activation='tanh',
proj_activation='tanh',
dtype='float32',
name=None):
name=None,
h_0=None,
c_0=None,
cell_clip=None,
proj_clip=None):
"""
**Dynamic LSTMP Layer**
......@@ -785,6 +790,17 @@ def dynamic_lstmp(input,
dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
h_0(Variable): The initial hidden state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the projection size.
c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size. `h_0` and `c_0` can both be None, but only at the same time.
cell_clip(float): If provided, the cell state is clipped
by this value prior to the cell output activation.
proj_clip(float): If `num_proj > 0` and `proj_clip` is
provided, then the projected values are clipped elementwise to within
`[-proj_clip, proj_clip]`.
Returns:
tuple: A tuple of two output variable: the projection of hidden state, \
......@@ -831,25 +847,41 @@ def dynamic_lstmp(input,
batch_hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='lstmp',
inputs={
inputs = {
'Input': input,
'Weight': weight,
'ProjWeight': proj_weight,
'Bias': bias
},
}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, proj_size), \
'The shape of h0 should be (batch_size, %d)' % proj_size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
if cell_clip:
assert cell_clip >= 0, "cell_clip should not be negative."
if proj_clip:
assert proj_clip >= 0, "proj_clip should not be negative."
helper.append_op(
type='lstmp',
inputs=inputs,
outputs={
'Projection': projection,
'Cell': cell,
'OrderedP0': ordered_proj0,
'BatchHidden': batch_hidden,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'cell_clip': cell_clip,
'proj_clip': proj_clip,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
......@@ -2441,7 +2473,7 @@ def pool2d(input,
data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.pool2d(
pool2d = fluid.layers.pool2d(
input=data,
pool_size=2,
pool_type='max',
......@@ -2490,6 +2522,7 @@ def pool2d(input,
return pool_out
@templatedoc()
def pool3d(input,
pool_size=-1,
pool_type="max",
......@@ -2501,13 +2534,19 @@ def pool3d(input,
name=None,
exclusive=True):
"""
This function adds the operator for pooling in 3-dimensions, using the
pooling configurations mentioned in input parameters.
${comment}
Args:
input (Variable): ${input_comment}
pool_size (int): ${ksize_comment}
pool_type (str): ${pooling_type_comment}
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCDHW, where N is batch size, C is
the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width
of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
pool_type (string): ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
global_pooling (bool): ${global_pooling_comment}
......@@ -2520,6 +2559,19 @@ def pool3d(input,
Returns:
Variable: output of pool3d layer.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool3d = fluid.layers.pool3d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
"""
if pool_type not in ["max", "avg"]:
raise ValueError(
......@@ -2569,7 +2621,27 @@ def adaptive_pool2d(input,
require_index=False,
name=None):
"""
${comment}
**Adaptive Pool2d Operator**
The adaptive_pool2d operation calculates the output based on the input, pool_size
and pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(pool_size) should contain two elements which
represent height and width, respectively. Also, the H and W dimensions of output(Out)
are the same as Parameter(pool_size).
For average adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i, j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
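For reference, the average adaptive pooling formula above can be sketched in
NumPy as follows (an illustrative sketch only, not the operator implementation):
.. code-block:: python
    import numpy as np
    def adaptive_avg_pool2d_ref(x, pool_size):
        # x: [N, C, H, W]; pool_size: [H_out, W_out]
        n, c, h_in, w_in = x.shape
        h_out, w_out = pool_size
        out = np.zeros((n, c, h_out, w_out), dtype=x.dtype)
        for i in range(h_out):
            hstart = i * h_in // h_out
            hend = -(-(i + 1) * h_in // h_out)  # ceil((i + 1) * H_in / H_out)
            for j in range(w_out):
                wstart = j * w_in // w_out
                wend = -(-(j + 1) * w_in // w_out)  # ceil((j + 1) * W_in / W_out)
                out[:, :, i, j] = x[:, :, hstart:hend,
                                    wstart:wend].mean(axis=(2, 3))
        return out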
Args:
input (Variable): The input tensor of pooling operator. The format of
......@@ -2579,8 +2651,8 @@ def adaptive_pool2d(input,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point along with outputs.
it cannot be set in average pooling type.
require_index (bool): If true, the index of max pooling point will be returned along
with outputs. It cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
......@@ -2661,18 +2733,42 @@ def adaptive_pool3d(input,
require_index=False,
name=None):
"""
${comment}
**Adaptive Pool3d Operator**
The adaptive_pool3d operation calculates the output based on the input, pool_size
and pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(pool_size) should contain
three elements which represent depth, height and width, respectively. Also, the D, H and W
dimensions of output(Out) are the same as Parameter(pool_size).
For average adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i, j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args:
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is
the number of channels, H is the height of the
feature, and W is the width of the feature.
input tensor is NCDHW, where N is batch size, C is
the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (Depth, Height, Width).
it must contain three integers, (Depth, Height, Width).
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point along with outputs.
it cannot be set in average pooling type.
require_index (bool): If true, the index of max pooling point will be returned along
with outputs. It cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
......@@ -2709,7 +2805,7 @@ def adaptive_pool3d(input,
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool_out, mask = fluid.layers.adaptive_pool3d(
input=data,
pool_size=[3, 3],
pool_size=[3, 3, 3],
pool_type='avg')
"""
if pool_type not in ["max", "avg"]:
......@@ -2945,7 +3041,6 @@ def data_norm(input,
param_attr=None,
data_layout='NCHW',
in_place=False,
use_mkldnn=False,
name=None,
moving_mean_name=None,
moving_variance_name=None,
......@@ -2979,7 +3074,6 @@ def data_norm(input,
param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
data_layout(string, default NCHW): NCHW|NHWC
in_place(bool, Default False): Make the input and output of batch norm reuse memory.
use_mkldnn(bool, Default false): ${use_mkldnn_comment}
name(string, Default None): A name for this layer(optional). If set None, the layer
will be named automatically.
moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
......@@ -3060,8 +3154,7 @@ def data_norm(input,
outputs={"Y": data_norm_out,
"Means": means,
"Scales": scales},
attrs={"epsilon": epsilon,
"use_mkldnn": use_mkldnn})
attrs={"epsilon": epsilon})
return helper.append_activation(data_norm_out)
......@@ -5765,6 +5858,132 @@ def softmax_with_cross_entropy(logits,
return loss
def sampled_softmax_with_cross_entropy(logits,
label,
num_samples,
num_true=1,
remove_accidental_hits=True,
use_customized_samples=False,
customized_samples=None,
customized_probabilities=None,
seed=0):
"""
**Sampled Softmax With Cross Entropy Operator.**
Cross entropy loss with sampled softmax is widely used as the output layer
when the number of output classes is large. This operator draws a number of
samples for all examples and computes the softmax normalized values for each
row of the sampled tensor, after which the cross-entropy loss is computed.
Because this operator performs a softmax on logits internally, it expects
unscaled logits. It should not be used with the output of a softmax
operator, since that would produce incorrect results.
For examples with T true labels (T >= 1), we assume that each true label has
a probability of 1/T. For each sample, S samples are generated using a
log uniform distribution. True labels are concatenated with these samples to
form T + S samples for each example. So, assuming the shape of logits is
[N x K], the shape of samples is [N x (T+S)]. For each sampled label, a
probability is calculated, which corresponds to the Q(y|x) in
[Jean et al., 2014](http://arxiv.org/abs/1412.2007).
Logits are gathered according to the sampled labels. If
remove_accidental_hits is True and a sample[i, j] accidentally hits a true
label, the corresponding sampled_logits[i, j] is reduced by 1e20 to
make its softmax result close to zero. Then logQ(y|x) is subtracted from the
sampled logits; these sampled logits and the re-indexed labels are used to compute
a softmax with cross entropy.
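The two adjustments above can be sketched in NumPy as follows (an
illustrative sketch for a single example with hypothetical variable names;
not the operator implementation):
.. code-block:: python
    import numpy as np
    def adjust_sampled_logits(sampled_logits, samples, num_true, q,
                              remove_accidental_hits=True):
        # sampled_logits, samples, q: 1-D arrays of length T + S for one
        # example; the first num_true entries correspond to the true labels.
        adjusted = np.array(sampled_logits, dtype=np.float64)
        true_labels = set(samples[:num_true].tolist())
        if remove_accidental_hits:
            for j in range(num_true, len(samples)):
                if samples[j] in true_labels:
                    adjusted[j] -= 1e20  # push its softmax result towards zero
        # subtract log Q(y|x) before the softmax with cross entropy
        return adjusted - np.log(q)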
Args:
logits (Variable): The unscaled log probabilities, which is a 2-D tensor
with shape [N x K]. N is the batch_size, and K is the class number.
label (Variable): The ground truth which is a 2-D tensor. Label is a
Tensor<int64> with shape [N x T], where T is the number of true
labels per example.
num_samples (int): The number of samples drawn for each example; num_samples
should be less than the number of classes.
num_true(int): The number of target classes per training example.
remove_accidental_hits (bool): A flag indicating whether to remove
accidental hits when sampling. If True and if a sample[i, j]
accidentally hits true labels, then the corresponding
sampled_logits[i, j] is minus by 1e20 to make its softmax result
close to zero. Default is True.
use_customized_samples (bool): Whether to use custom samples and probabilities to sample
logits.
customized_samples (Variable): User defined samples, which is a 2-D tensor
with shape [N, T + S]. S is the num_samples, and T is the number of true
labels per example.
customized_probabilities (Variable): User defined probabilities of samples,
a 2-D tensor which has the same shape with customized_samples.
seed (int): The random seed for generating random number, which is used
in the process of sampling. Default is 0.
Returns:
Variable: Return the cross entropy loss which is a 2-D tensor with shape
[N x 1].
Examples:
.. code-block:: python
logits = fluid.layers.data(name='data', shape=[256], dtype='float32')
label = fluid.layers.data(name='label', shape=[5], dtype='int64')
fc = fluid.layers.fc(input=logits, size=100)
out = fluid.layers.sampled_softmax_with_cross_entropy(
logits=fc, label=label, num_samples=25)
"""
helper = LayerHelper('sample_logits', **locals())
samples = helper.create_variable_for_type_inference(dtype='int64')
probabilities = helper.create_variable_for_type_inference(
dtype=logits.dtype)
sampled_logits \
= helper.create_variable_for_type_inference(dtype=logits.dtype)
sampled_label = helper.create_variable_for_type_inference(dtype='int64')
sampled_softlabel = helper.create_variable_for_type_inference(
dtype=logits.dtype)
helper.append_op(
type='sample_logits',
inputs={
'Logits': logits,
'Labels': label,
'CustomizedSamples': customized_samples,
'CustomizedProbabilities': customized_probabilities
},
outputs={
'Samples': samples,
'Probabilities': probabilities,
'SampledLabels': sampled_label,
'SampledLogits': sampled_logits
},
attrs={
'use_customized_samples': use_customized_samples,
'uniq': True,
'remove_accidental_hits': remove_accidental_hits,
'num_samples': num_samples,
'seed': seed
})
loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
helper.append_op(
type='one_hot',
inputs={'X': sampled_label},
attrs={'depth': num_samples + 1},
outputs={'Out': sampled_softlabel})
helper.append_op(
type='softmax_with_cross_entropy',
inputs={'Logits': sampled_logits,
'Label': sampled_softlabel},
outputs={'Softmax': softmax,
'Loss': loss},
attrs={
'soft_label': True,
'ignore_index': False,
'numeric_stable_mode': False
})
return loss / num_true
def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
"""
This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
......@@ -9723,6 +9942,7 @@ def teacher_student_sigmoid_loss(input,
Examples:
.. code-block:: python
cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
"""
helper = LayerHelper('teacher_student_sigmoid_loss', **locals())
......
......@@ -60,7 +60,28 @@ __all__ += ["uniform_random"]
_uniform_random_ = generate_layer_fn('uniform_random')
def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
"""
This operator initializes a variable with random values sampled from a
uniform distribution. The generated values lie in the range [min, max].
Args:
shape (list): The shape of output variable.
dtype(np.dtype|core.VarDesc.VarType|str): The type of data, such as
float32, float64 etc. Default: float32.
min (float): Minimum value of uniform random. Default -1.0.
max (float): Maximum value of uniform random. Default 1.0.
seed (int): Random seed used for generating samples. 0 means use a
seed generated by the system. Note that if seed is not 0, this
operator will always generate the same random numbers every time.
Default 0.
Examples:
.. code-block:: python
result = fluid.layers.uniform_random(shape=[32, 784])
"""
locals_var = locals().keys()
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
......@@ -72,12 +93,6 @@ def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
return _uniform_random_(**kwargs)
uniform_random.__doc__ = _uniform_random_.__doc__ + """
Examples:
>>> result = fluid.layers.uniform_random(shape=[32, 784])
"""
__all__ += ['hard_shrink']
_hard_shrink_ = generate_layer_fn('hard_shrink')
......
......@@ -29,15 +29,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy = core.ParallelExecutor.BuildStrategy
def _is_pserver_mode(main_program):
main = main_program if main_program \
else framework.default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
class ParallelExecutor(object):
"""
ParallelExecutor is designed for data parallelism, which focuses on distributing
......@@ -140,7 +131,7 @@ class ParallelExecutor(object):
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy.is_distribution = _is_pserver_mode(
build_strategy.is_distribution = framework.is_pserver_mode(
main_program) or num_trainers > 1
# step4: get main_program, scope, local_scopes
......@@ -185,10 +176,13 @@ class ParallelExecutor(object):
places = list(map(place_obj, self._places))
# step7: init ParallelExecutor
# ParallelExecutor API will be deprecated, don't support parallel graph.
self._graph = core.Graph(main.desc)
self.executor = core.ParallelExecutor(
places, persistable_vars, main.desc,
places, persistable_vars,
cpt.to_text(loss_name) if loss_name else six.u(''), scope,
local_scopes, exec_strategy, build_strategy)
local_scopes, exec_strategy, build_strategy, self._graph)
self.scope = scope
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import six
import unittest
import time
import math
import multiprocessing
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler
# open eager delete mode
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['FLAGS_fast_eager_deletion_mode'] = 'true'
os.environ['CPU_NUM'] = '2'
class BuildIrMemOptBase(unittest.TestCase):
def check_network_convergence(self,
network,
use_cuda=True,
memory_opt=True,
use_ir_memory_optimize=True,
enable_inplace=True,
iter=5):
if use_cuda and not core.is_compiled_with_cuda():
print('Skip use_cuda=True because Paddle is not compiled with cuda')
return
if os.name == 'nt':
print(
'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
)
return
fluid.default_startup_program().random_seed = 100
fluid.default_main_program().random_seed = 100
batch_size = 32
batch_size *= fluid.core.get_cuda_device_count() if use_cuda else int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
# build network
word_dict = paddle.dataset.imdb.word_dict()
train_reader = paddle.batch(
paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost = network(data, label, len(word_dict))
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(cost)
if memory_opt:
fluid.memory_optimize(fluid.default_main_program())
# execution
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
reader = feeder.decorate_reader(train_reader, multi_devices=True)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
train_cp = compiler.CompiledProgram(fluid.default_main_program())
train_cp = train_cp.with_data_parallel(loss_name=cost.name)
fetch_list = [cost.name]
begin = time.time()
first_loss, last_loss = None, None
step_id = 0
custom_iter = getattr(self, "iter", None)
if custom_iter is not None:
iter = custom_iter
for data in reader():
ret = exe.run(train_cp, feed=data, fetch_list=fetch_list)
print(ret)
step_id += 1
if step_id == 1:
first_loss = ret[0]
if step_id == iter:
last_loss = ret[0]
break
end = time.time()
print("%.4f Instance per second" % (
(batch_size * iter) / (end - begin)))
print(first_loss, last_loss)
avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean()
if math.isnan(float(avg_last_loss_val)) or math.isnan(
float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.")
return first_loss, last_loss
class TestIrMemOptBase(BuildIrMemOptBase):
def setUp(self):
self.network = None
def test_network(self):
if self.network is None or not core.is_compiled_with_cuda():
return
baseline_first_loss, baseline_last_loss = None, None
for use_cuda in [True]:
for use_python_mem_opt in [True, False]:
print(
'network: {}, use_cuda: {}, use_python_mem_opt: {}, use_ir_mem_opt : {}'.
format(self.network.__name__, use_cuda, use_python_mem_opt,
not use_python_mem_opt))
with fluid.program_guard(fluid.Program(), fluid.Program()):
with fluid.scope_guard(core.Scope()):
if use_cuda is True and use_python_mem_opt is True:
baseline_first_loss, baseline_last_loss = self.check_network_convergence(
self.network,
use_cuda=use_cuda,
memory_opt=use_python_mem_opt)
else:
cur_first_loss, cur_last_loss = self.check_network_convergence(
self.network,
use_cuda=use_cuda,
memory_opt=use_python_mem_opt)
self.assertAlmostEquals(
np.mean(baseline_last_loss),
np.mean(cur_last_loss),
delta=1e-2)
self.assertAlmostEquals(
np.mean(baseline_first_loss),
np.mean(cur_first_loss),
delta=1e-2)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle.fluid.core as core
import paddle.fluid as fluid
def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
out_grad, x_grad):
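# Build a one-op program that runs `op_type` with use_mkldnn=True, append its
# generated grad op, then execute the program twice so the second run reuses
# the MKL-DNN primitives cached by the first; the backward result is checked
# against the provided reference `x_grad`.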
def __assert_close(tensor, np_array, msg, atol=1e-4):
test_case.assertTrue(
np.allclose(
np.array(tensor), np_array, atol=atol), msg)
place = core.CPUPlace()
var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}
var_names = list(var_dict.keys())
ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
for name in ground_truth:
block.create_var(
name=name, dtype=np.float32, shape=ground_truth[name].shape)
op = block.append_op(
type=op_type,
inputs={'X': block.var('x'), },
outputs={'Out': block.var('out')},
attrs={'use_mkldnn': True})
# Generate backward op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
set(), [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
for var_name in grad_op_desc.output_arg_names():
block.desc.var(var_name.encode('ascii'))
grad_op_desc.infer_var_type(block.desc)
grad_op_desc.infer_shape(block.desc)
for arg in grad_op_desc.output_arg_names():
grad_var = block.desc.find_var(arg.encode('ascii'))
grad_var.set_dtype(core.VarDesc.VarType.FP32)
exe = fluid.Executor(place)
# Do at least 2 iterations
for i in range(2):
out = exe.run(
program,
feed={name: var_dict[name]
for name in ['x', 'out@GRAD']},
fetch_list=['x@GRAD', 'out'])
__assert_close(x_grad, out[0], 'x@GRAD')
......@@ -19,7 +19,7 @@ import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
import paddle.fluid as fluid
from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
class TestMKLDNNReluDim2(TestRelu):
......@@ -98,62 +98,24 @@ class TestMKLDNNAbsDim4(TestAbs):
# Check if primitives already exist in backward
class TestMKLDNNReluPrimitivesAlreadyExist(unittest.TestCase):
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def test_check_forward_backward(self):
place = core.CPUPlace()
class TestMKLDNNAbsPrimitivesAlreadyExist(unittest.TestCase):
def setUp(self):
super(TestMKLDNNAbsPrimitivesAlreadyExist, self).setUp()
np.random.seed(123)
x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32)
out = np.abs(x)
out_grad = np.random.random_sample(x.shape).astype(np.float32)
x_grad = out_grad * np.sign(x) # Abs grad calculation
var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}
var_names = list(var_dict.keys())
ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
for name in ground_truth:
block.create_var(
name=name, dtype='float32', shape=ground_truth[name].shape)
relu_op = block.append_op(
type="abs",
inputs={"X": block.var('x'), },
outputs={"Out": block.var('out')},
attrs={"use_mkldnn": True})
# Generate backward op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
relu_op.desc, set(), [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
for var_name in grad_op_desc.output_arg_names():
block.desc.var(var_name.encode("ascii"))
grad_op_desc.infer_var_type(block.desc)
grad_op_desc.infer_shape(block.desc)
for arg in grad_op_desc.output_arg_names():
grad_var = block.desc.find_var(arg.encode("ascii"))
grad_var.set_dtype(core.VarDesc.VarType.FP32)
exe = fluid.Executor(place)
# Do at least 2 iterations
for i in range(2):
out = exe.run(
program,
feed={name: var_dict[name]
for name in ['x', 'out@GRAD']},
fetch_list=['x@GRAD'])
self.__assert_close(x_grad, out[0], "x@GRAD")
self.op_type = 'abs'
self.x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32)
self.out = np.abs(self.x)
self.out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
self.x_grad = self.__abs_bwd(self.x, self.out_grad)
# Abs grad calculation
def __abs_bwd(self, x, out_grad):
return out_grad * np.sign(x)
def test_check(self):
check_if_mkldnn_primitives_exist_in_bwd(
self, self.op_type, self.x, self.out, self.out_grad, self.x_grad)
if __name__ == '__main__':
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.test_softmax_op import TestSoftmaxOp, stable_softmax
from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
def init_kernel_type(self):
self.use_mkldnn = True
class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp):
def get_x_shape(self):
return [2, 3, 4, 5]
# Check if primitives already exist in backward
class TestSoftmaxMKLDNNPrimitivesAlreadyExist(unittest.TestCase):
def setUp(self):
super(TestSoftmaxMKLDNNPrimitivesAlreadyExist, self).setUp()
np.random.seed(123)
self.op_type = 'softmax'
self.x = np.random.uniform(-1, 1, 2).astype(np.float32)
self.out = stable_softmax(self.x)
self.out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
self.x_grad = self.__softmax_bwd(self.out, self.out_grad)
# Softmax grad calculation
def __softmax_bwd(self, out, out_grad):
return out * (out_grad - np.dot(out, out_grad))
def test_check(self):
check_if_mkldnn_primitives_exist_in_bwd(
self, self.op_type, self.x, self.out, self.out_grad, self.x_grad)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestAllocContinuousSpace(OpTest):
def setUp(self):
self.op_type = "alloc_continuous_space"
self.dtype = np.float32
attrs = self.init_attr()
self.copy_data = attrs["copy_data"]
self.constant = attrs["constant"]
self.set_constant = attrs["set_constant"]
self.Inputs = self.init_input()
self.FusedOutput = self.init_output(self.Inputs, self.set_constant,
self.constant)
self.inputs = {'Input': self.Inputs}
self.attrs = attrs
self.outputs = {'Output': self.Inputs, 'FusedOutput': self.FusedOutput}
def init_dtype(self):
self.dtype = np.float32
def init_input(self):
inputs = []
inputs.append(("x1", np.random.random([20, 3]).astype(self.dtype)))
inputs.append(("x2", np.random.random([20]).astype(self.dtype)))
inputs.append(("x3", np.random.random([1]).astype(self.dtype)))
inputs.append(("x4", np.random.random([200, 30]).astype(self.dtype)))
inputs.append(("x5", np.random.random([30]).astype(self.dtype)))
inputs.append(("x6", np.random.random([1]).astype(self.dtype)))
return inputs
def init_attr(self):
return {"copy_data": True, "set_constant": False, "constant": 0.0}
def init_output(self, input_list, set_constant, constant):
inputs = [input[1].flatten() for input in input_list]
output = np.concatenate(inputs)
if set_constant:
output = np.ones((len(output))) * constant
return output
def test_check_output(self):
self.check_output()
class TestAllocContinuousSpace2(TestAllocContinuousSpace):
def init_attr(self):
return {"copy_data": False, "set_constant": True, "constant": 0.5}
def test_check_output(self):
self.check_output(no_check_set=["Output"])
if __name__ == '__main__':
unittest.main()
......@@ -20,10 +20,10 @@ from paddle.fluid.layer_helper import LayerHelper
class L1(fluid.imperative.Layer):
def __init__(self):
super(L1, self).__init__()
def __init__(self, prefix):
super(L1, self).__init__(prefix)
self._helper = LayerHelper(
'MyLayer',
self.full_name(),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
......@@ -43,20 +43,20 @@ class L1(fluid.imperative.Layer):
class L2(fluid.imperative.Layer):
def __init__(self):
super(L2, self).__init__()
self.layer1 = L1()
self.layer2 = L1()
def __init__(self, prefix):
super(L2, self).__init__(prefix)
self.layer1 = L1(self.full_name())
self.layer2 = L1(self.full_name())
def forward(self):
return self.layer1() + self.layer2()
class L3(fluid.imperative.Layer):
def __init__(self):
super(L3, self).__init__()
self.layer1 = L2()
self.layer2 = L2()
def __init__(self, prefix):
super(L3, self).__init__(prefix)
self.layer1 = L2(self.full_name())
self.layer2 = L2(self.full_name())
def forward(self):
return self.layer1() + self.layer2()
......@@ -65,16 +65,23 @@ class L3(fluid.imperative.Layer):
class TestBaseLayer(unittest.TestCase):
def test_one_level(self):
with fluid.imperative.guard():
l = L1()
l = L1('test_one_level')
ret = l()
self.assertEqual(l.w1.name, "MyLayer_0.w_0")
self.assertEqual(l.w2.name, "MyLayer_0.w_1")
self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0")
self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1")
self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
def test_three_level(self):
with fluid.imperative.guard():
l = L3()
l = L3('test_three_level')
names = [p.name for p in l.parameters()]
ret = l()
self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0")
self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1")
self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0")
self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1")
self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0")
self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1")
self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
......
......@@ -56,6 +56,8 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
train_reader, multi_devices=use_parallel_executor)
exe = fluid.Executor(place)
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
exe.run(fluid.default_startup_program())
train_cp = compiler.CompiledProgram(fluid.default_main_program())
......
......@@ -35,7 +35,7 @@ class TestFakeQuantizeOp(OpTest):
self.check_output()
class TestFakeQuantizeOp(OpTest):
class TestFakeQuantizeRangeAbsMaxOp(OpTest):
def setUp(self):
self.op_type = "fake_quantize_range_abs_max"
self.attrs = {
......@@ -43,8 +43,10 @@ class TestFakeQuantizeOp(OpTest):
'window_size': int(1),
'is_test': False
}
x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10
x = x.astype("float32")
self.inputs = {
'X': np.random.random((8, 16, 7, 7)).astype("float32"),
'X': x,
'Iter': np.zeros(1).astype("int64"),
'InScale': np.zeros(1).astype("float32")
}
......@@ -62,5 +64,36 @@ class TestFakeQuantizeOp(OpTest):
self.check_output()
class TestFakeQuantizeRangeAbsMaxOp2(OpTest):
def setUp(self):
self.op_type = "fake_quantize_range_abs_max"
self.attrs = {
'bit_length': int(8),
'window_size': int(1),
'is_test': True
}
x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10
x = x.astype("float32")
scale = np.max(np.abs(x)).astype("float32") - 1.0
out_scales = np.zeros(self.attrs['window_size']).astype("float32")
out_scales[0] = scale
self.inputs = {
'X': x,
'Iter': np.zeros(1).astype("int64"),
'InScale': scale.astype("float32")
}
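# Reference computation: clip to [-scale, scale], then map to the signed
# integer grid of `bit_length` bits (here [-127, 127]).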
xs = np.clip(x, -scale, scale)
qs = np.round(xs / scale * ((1 << (self.attrs['bit_length'] - 1)) - 1))
self.outputs = {
'Out': qs,
'OutScale': scale.astype("float32"),
'OutScales': out_scales,
}
def test_check_output(self):
self.check_output(no_check_set=set(['OutScale', 'OutScales']))
if __name__ == "__main__":
unittest.main()
......@@ -15,7 +15,6 @@
import contextlib
import unittest
import numpy as np
import sys
import paddle.fluid as fluid
from paddle.fluid import core
......@@ -24,8 +23,8 @@ from test_imperative_base import new_program_scope
class MyLayer(fluid.imperative.Layer):
def __init__(self):
super(MyLayer, self).__init__()
def __init__(self, name_scope):
super(MyLayer, self).__init__(name_scope)
def forward(self, inputs):
x = fluid.layers.relu(inputs)
......@@ -50,12 +49,14 @@ class MyPyLayer(fluid.imperative.PyLayer):
class MLP(fluid.imperative.Layer):
def __init__(self):
super(MLP, self).__init__()
self._fc1 = FC(3,
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._fc1 = FC(self.full_name(),
3,
fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
self._fc2 = FC(4,
self._fc2 = FC(self.full_name(),
4,
fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
......@@ -67,8 +68,9 @@ class MLP(fluid.imperative.Layer):
class SimpleRNNCell(fluid.imperative.Layer):
def __init__(self, step_input_size, hidden_size, output_size, param_attr):
super(SimpleRNNCell, self).__init__()
def __init__(self, name_scope, step_input_size, hidden_size, output_size,
param_attr):
super(SimpleRNNCell, self).__init__(name_scope)
self.step_input_size = step_input_size
self.hidden_size = hidden_size
self.output_size = output_size
......@@ -158,10 +160,11 @@ class SimpleRNNCell(fluid.imperative.Layer):
class SimpleRNN(fluid.imperative.Layer):
def __init__(self):
super(SimpleRNN, self).__init__()
def __init__(self, name_scope):
super(SimpleRNN, self).__init__(name_scope)
self.seq_len = 4
self._cell = SimpleRNNCell(
self.full_name(),
3,
3,
3,
......@@ -205,7 +208,7 @@ class TestImperative(unittest.TestCase):
with fluid.imperative.guard():
cl = core.Layer()
cl.forward([])
l = fluid.imperative.Layer()
l = fluid.imperative.Layer("l")
self.assertRaises(NotImplementedError, l.forward, [])
def test_pylayer_func_id(self):
......@@ -281,7 +284,7 @@ class TestImperative(unittest.TestCase):
np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
with fluid.imperative.guard():
var_inp = fluid.imperative.base.to_variable(np_inp)
l = MyLayer()
l = MyLayer("my_layer")
x = l(var_inp)[0]
self.assertIsNotNone(x)
dy_out = x._numpy()
......@@ -291,7 +294,7 @@ class TestImperative(unittest.TestCase):
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[3], append_batch_size=False)
l = MyLayer()
l = MyLayer("my_layer")
x = l(inp)[0]
param_grads = fluid.backward.append_backward(
x, parameter_list=[l._x_for_debug.name])[0]
......@@ -309,7 +312,7 @@ class TestImperative(unittest.TestCase):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
with fluid.imperative.guard():
var_inp = fluid.imperative.base.to_variable(np_inp)
mlp = MLP()
mlp = MLP("mlp")
out = mlp(var_inp)
dy_out = out._numpy()
out._backward()
......@@ -318,7 +321,7 @@ class TestImperative(unittest.TestCase):
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[2, 2], append_batch_size=False)
mlp = MLP()
mlp = MLP("mlp")
out = mlp(inp)
param_grads = fluid.backward.append_backward(
out, parameter_list=[mlp._fc1._w.name])[0]
......@@ -334,10 +337,10 @@ class TestImperative(unittest.TestCase):
self.assertTrue(np.allclose(dy_grad, static_grad))
params = mlp.parameters(True)
self.assertEqual("FC_0.w_0", params[0].name)
self.assertEqual("FC_0.b_0", params[1].name)
self.assertEqual("FC_1.w_0", params[2].name)
self.assertEqual("FC_1.b_0", params[3].name)
self.assertEqual("mlp/MLP_0/FC_0_0.w_0", params[0].name)
self.assertEqual("mlp/MLP_0/FC_0_0.b_0", params[1].name)
self.assertEqual("mlp/MLP_0/FC_1_0.w_0", params[2].name)
self.assertEqual("mlp/MLP_0/FC_1_0.b_0", params[3].name)
self.assertEqual(len(params), 4)
sublayers = mlp.sublayers(True)
......@@ -353,7 +356,7 @@ class TestImperative(unittest.TestCase):
with fluid.imperative.guard():
var_inp = fluid.imperative.base.to_variable(np_inp)
var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
simple_rnn = SimpleRNN()
simple_rnn = SimpleRNN("simple_rnn")
outs, pre_hiddens = simple_rnn.forward(var_inp)
dy_out = outs[3]._numpy()
outs[3]._backward()
......@@ -364,7 +367,7 @@ class TestImperative(unittest.TestCase):
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[1, 4, 3], append_batch_size=False)
simple_rnn = SimpleRNN()
simple_rnn = SimpleRNN("simple_rnn")
outs, pre_hiddens = simple_rnn(inp)
param_grads = fluid.backward.append_backward(outs[3])
exe = fluid.Executor(fluid.CPUPlace())
......
......@@ -28,10 +28,10 @@ from paddle.fluid.imperative.base import to_variable
class Discriminator(fluid.imperative.Layer):
def __init__(self):
super(Discriminator, self).__init__()
self._fc1 = FC(size=32, act='elu', name="d_fc1")
self._fc2 = FC(size=1, name="d_fc2")
def __init__(self, name_scope):
super(Discriminator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=32, act='elu')
self._fc2 = FC(self.full_name(), size=1)
def forward(self, inputs):
x = self._fc1(inputs)
......@@ -39,11 +39,11 @@ class Discriminator(fluid.imperative.Layer):
class Generator(fluid.imperative.Layer):
def __init__(self):
super(Generator, self).__init__()
self._fc1 = FC(size=64, act='elu', name="g_fc1")
self._fc2 = FC(size=64, act='elu', name="g_fc2")
self._fc3 = FC(size=1, name="g_fc3")
def __init__(self, name_scope):
super(Generator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=64, act='elu')
self._fc2 = FC(self.full_name(), size=64, act='elu')
self._fc3 = FC(self.full_name(), size=1)
def forward(self, inputs):
x = self._fc1(inputs)
......@@ -65,8 +65,8 @@ class TestImperativeMnist(unittest.TestCase):
scope = fluid.core.Scope()
with new_program_scope(
main=discriminate_p, startup=startup, scope=scope):
discriminator = Discriminator()
generator = Generator()
discriminator = Discriminator("d")
generator = Generator("g")
img = fluid.layers.data(
name="img", shape=[2, 1], append_batch_size=False)
......@@ -93,8 +93,8 @@ class TestImperativeMnist(unittest.TestCase):
sgd.minimize(d_loss)
with new_program_scope(main=generate_p, startup=startup, scope=scope):
discriminator = Discriminator()
generator = Generator()
discriminator = Discriminator("d")
generator = Generator("g")
noise = fluid.layers.data(
name="noise", shape=[2, 2], append_batch_size=False)
......@@ -134,8 +134,8 @@ class TestImperativeMnist(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
discriminator = Discriminator()
generator = Generator()
discriminator = Discriminator("d")
generator = Generator("g")
sgd = SGDOptimizer(learning_rate=1e-3)
d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
......
......@@ -28,6 +28,7 @@ from test_imperative_base import new_program_scope
class SimpleImgConvPool(fluid.imperative.Layer):
def __init__(self,
name_scope,
num_channels,
num_filters,
filter_size,
......@@ -44,9 +45,10 @@ class SimpleImgConvPool(fluid.imperative.Layer):
use_cudnn=False,
param_attr=None,
bias_attr=None):
super(SimpleImgConvPool, self).__init__()
super(SimpleImgConvPool, self).__init__(name_scope)
self._conv2d = Conv2D(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
......@@ -59,6 +61,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):
use_cudnn=use_cudnn)
self._pool2d = Pool2D(
self.full_name(),
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
......@@ -73,19 +76,20 @@ class SimpleImgConvPool(fluid.imperative.Layer):
class MNIST(fluid.imperative.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super(MNIST, self).__init__()
def __init__(self, name_scope, param_attr=None, bias_attr=None):
super(MNIST, self).__init__(name_scope)
self._simple_img_conv_pool_1 = SimpleImgConvPool(
1, 20, 5, 2, 2, act="relu")
self.full_name(), 1, 20, 5, 2, 2, act="relu")
self._simple_img_conv_pool_2 = SimpleImgConvPool(
20, 50, 5, 2, 2, act="relu")
self.full_name(), 20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(10,
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
......@@ -101,25 +105,24 @@ class MNIST(fluid.imperative.Layer):
class TestImperativeMnist(unittest.TestCase):
def test_mnist_float32(self):
seed = 90
batch_num = 2
epoch_num = 1
with fluid.imperative.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mnist = MNIST()
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128)
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
dy_param_init_value = {}
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
dy_x_data = np.array(
[x[0].reshape(1, 28, 28) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
128, 1)
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(128, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
......@@ -128,19 +131,21 @@ class TestImperativeMnist(unittest.TestCase):
cost = mnist(img)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
dy_out = avg_loss._numpy()
if batch_id == 0:
for param in fluid.default_main_program().global_block(
).all_parameters():
if epoch == 0 and batch_id == 0:
for param in mnist.parameters():
dy_param_init_value[param.name] = param._numpy()
avg_loss._backward()
sgd.minimize(avg_loss)
mnist.clear_gradients()
fluid.default_main_program().global_block()._clear_block()
dy_param_value = {}
for param in fluid.default_main_program().global_block(
).all_parameters():
for param in mnist.parameters():
dy_param_value[param.name] = param._numpy()
with new_program_scope():
......@@ -150,10 +155,10 @@ class TestImperativeMnist(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST()
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128)
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
......@@ -166,8 +171,7 @@ class TestImperativeMnist(unittest.TestCase):
# initialize params and fetch them
static_param_init_value = {}
static_param_name_list = []
for param in fluid.default_startup_program().global_block(
).all_parameters():
for param in mnist.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
......@@ -176,18 +180,18 @@ class TestImperativeMnist(unittest.TestCase):
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
static_x_data = np.array(
[x[0].reshape(1, 28, 28) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
[128, 1])
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape([128, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
out = exe.run(
fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
......@@ -195,7 +199,10 @@ class TestImperativeMnist(unittest.TestCase):
static_param_value = {}
static_out = out[0]
for i in range(1, len(out)):
static_param_value[static_param_name_list[i - 1]] = out[i]
static_param_value[static_param_name_list[i - 1]] = out[
i]
self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
......@@ -203,7 +210,7 @@ class TestImperativeMnist(unittest.TestCase):
self.assertTrue(np.allclose(static_out, dy_out))
for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key]))
self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
if __name__ == '__main__':
......
......@@ -28,12 +28,13 @@ from paddle.fluid.backward import append_backward
class SimpleLSTMRNN(fluid.imperative.Layer):
def __init__(self,
name_scope,
hidden_size,
num_steps,
num_layers=2,
init_scale=0.1,
dropout=None):
super(SimpleLSTMRNN, self).__init__()
super(SimpleLSTMRNN, self).__init__(name_scope)
self._hidden_size = hidden_size
self._num_layers = num_layers
self._init_scale = init_scale
......@@ -130,13 +131,14 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
class PtbModel(fluid.imperative.Layer):
def __init__(self,
name_scope,
hidden_size,
vocab_size,
num_layers=2,
num_steps=20,
init_scale=0.1,
dropout=None):
super(PtbModel, self).__init__()
super(PtbModel, self).__init__(name_scope)
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
......@@ -146,12 +148,14 @@ class PtbModel(fluid.imperative.Layer):
from paddle.fluid.layer_helper import LayerHelper
self._helper = LayerHelper('PtbModel', act="tanh")
self.simple_lstm_rnn = SimpleLSTMRNN(
self.full_name(),
hidden_size,
num_steps,
num_layers=num_layers,
init_scale=init_scale,
dropout=dropout)
self.embedding = Embedding(
self.full_name(),
size=[vocab_size, hidden_size],
dtype='float32',
is_sparse=False,
......@@ -226,6 +230,7 @@ class TestImperativePtbRnn(unittest.TestCase):
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
......@@ -265,6 +270,7 @@ class TestImperativePtbRnn(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
......
......@@ -70,15 +70,17 @@ def optimizer_setting(params):
class ConvBNLayer(fluid.imperative.Layer):
def __init__(self,
name_scope,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
super(ConvBNLayer, self).__init__()
super(ConvBNLayer, self).__init__(name_scope)
self._conv = Conv2D(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
......@@ -88,7 +90,7 @@ class ConvBNLayer(fluid.imperative.Layer):
act=None,
bias_attr=None)
self._batch_norm = BatchNorm(num_filters, act=act)
self._batch_norm = BatchNorm(self.full_name(), num_filters, act=act)
def forward(self, inputs):
y = self._conv(inputs)
......@@ -98,21 +100,29 @@ class ConvBNLayer(fluid.imperative.Layer):
class BottleneckBlock(fluid.imperative.Layer):
def __init__(self, num_channels, num_filters, stride, shortcut=True):
super(BottleneckBlock, self).__init__()
def __init__(self,
name_scope,
num_channels,
num_filters,
stride,
shortcut=True):
super(BottleneckBlock, self).__init__(name_scope)
self.conv0 = ConvBNLayer(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act='relu')
self.conv1 = ConvBNLayer(
self.full_name(),
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
self.conv2 = ConvBNLayer(
self.full_name(),
num_channels=num_filters,
num_filters=num_filters * 4,
filter_size=1,
......@@ -120,6 +130,7 @@ class BottleneckBlock(fluid.imperative.Layer):
if not shortcut:
self.short = ConvBNLayer(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
......@@ -141,13 +152,13 @@ class BottleneckBlock(fluid.imperative.Layer):
y = fluid.layers.elementwise_add(x=short, y=conv2)
layer_helper = LayerHelper('elementwise_add_activation', act='relu')
layer_helper = LayerHelper(self.full_name(), act='relu')
return layer_helper.append_activation(y)
class ResNet(fluid.imperative.Layer):
def __init__(self, layers=50, class_dim=102):
super(ResNet, self).__init__()
def __init__(self, name_scope, layers=50, class_dim=102):
super(ResNet, self).__init__(name_scope)
self.layers = layers
supported_layers = [50, 101, 152]
......@@ -163,9 +174,18 @@ class ResNet(fluid.imperative.Layer):
num_filters = [64, 128, 256, 512]
self.conv = ConvBNLayer(
num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
self.full_name(),
num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
self.pool2d_max = Pool2D(
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
self.full_name(),
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
self.bottleneck_block_list = []
num_channels = 64
......@@ -175,6 +195,7 @@ class ResNet(fluid.imperative.Layer):
bottleneck_block = self.add_sublayer(
'bb_%d_%d' % (block, i),
BottleneckBlock(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
......@@ -184,12 +205,13 @@ class ResNet(fluid.imperative.Layer):
shortcut = True
self.pool2d_avg = Pool2D(
pool_size=7, pool_type='avg', global_pooling=True)
self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)
import math
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(size=class_dim,
self.out = FC(self.full_name(),
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
......@@ -209,12 +231,12 @@ class TestImperativeResnet(unittest.TestCase):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 1
batch_num = 2
with fluid.imperative.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
resnet = ResNet()
resnet = ResNet("resnet")
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
import random
......@@ -264,6 +286,8 @@ class TestImperativeResnet(unittest.TestCase):
optimizer.minimize(avg_loss)
resnet.clear_gradients()
fluid.default_main_program().global_block()._clear_block()
dy_param_value = {}
for param in resnet.parameters():
dy_param_value[param.name] = param._numpy()
......@@ -275,7 +299,7 @@ class TestImperativeResnet(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet()
resnet = ResNet("resnet")
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
......@@ -297,11 +321,9 @@ class TestImperativeResnet(unittest.TestCase):
static_param_init_value = {}
static_param_name_list = []
static_grad_name_list = []
for param in fluid.default_startup_program().global_block(
).all_parameters():
for param in resnet.parameters():
static_param_name_list.append(param.name)
for param in fluid.default_main_program().global_block(
).all_parameters():
for param in resnet.parameters():
if not param.stop_gradient:
static_grad_name_list.append(param.name +
core.grad_var_suffix())
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NLP model: a stack of ops operating on LoD tensors. It's a classical test case for the memory optimize pass.
from __future__ import print_function
import paddle.fluid as fluid
import unittest
from ir_memory_optimize_net_base import TestIrMemOptBase
def lstm_net(data,
label,
dict_dim,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2,
emb_lr=30.0):
emb = fluid.layers.embedding(
input=data,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(learning_rate=emb_lr))
fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
lstm_h, c = fluid.layers.dynamic_lstm(
input=fc0, size=hid_dim * 4, is_reverse=False)
lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
lstm_max_tanh = fluid.layers.tanh(lstm_max)
fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost
class TestIrMemOptRNN(TestIrMemOptBase):
def setUp(self):
self.network = lstm_net
if __name__ == "__main__":
unittest.main()
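For orientation, lstm_net above only builds the cost graph; a rough usage sketch under stated assumptions (the input names and dict_dim value are placeholders, not taken from this commit):
```python
# Rough usage sketch with placeholder names: builds the avg_cost graph
# defined by lstm_net above on a LoD int64 word-id input.
import paddle.fluid as fluid

data = fluid.layers.data(
    name='words', shape=[1], dtype='int64', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
avg_cost = lstm_net(data, label, dict_dim=5147)  # dict_dim is illustrative
```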
......@@ -13,21 +13,44 @@
# limitations under the License.
import os
import sys
import unittest
from timeit import default_timer as timer
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.dataset.wmt16 as wmt16
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
os.environ[
'RECORDIO_FILENAME'] = '/tmp/ir_memory_optimize_transformer.wmt16.recordio'
from test_parallel_executor_transformer import TestTransformer
from test_parallel_executor_transformer import transformer
from test_parallel_executor_transformer import transformer, ModelHyperParams, transformer_model, transformer, prepare_batch_input
from parallel_executor_test_base import TestParallelExecutorBase
# NOTE(dzhwinter): test different strategy collisions.
# open the eager delete tensor strategy by default.
class TestTransformerWithIR(TestTransformer):
class TestTransformerWithIR(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
reader = paddle.batch(
wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=transformer_model.batch_size)
with fluid.recordio_writer.create_recordio_writer(
os.environ.get("RECORDIO_FILENAME")) as writer:
for batch in reader():
for tensor in prepare_batch_input(
batch, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
t = fluid.LoDTensor()
t.set(tensor, fluid.CPUPlace())
writer.append_tensor(t)
writer.complete_append_tensor()
def test_main(self):
if core.is_compiled_with_cuda():
# check python transpiler
......@@ -35,13 +58,15 @@ class TestTransformerWithIR(TestTransformer):
transformer,
use_cuda=True,
memory_opt=True,
use_ir_memory_optimize=False)
use_ir_memory_optimize=False,
iter=2)
# check IR memory optimize
self.check_network_convergence(
transformer,
use_cuda=True,
memory_opt=False,
use_ir_memory_optimize=True)
use_ir_memory_optimize=True,
iter=2)
if __name__ == '__main__':
......
......@@ -374,6 +374,17 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output)
print(str(program))
def test_sampled_softmax_with_cross_entropy(self):
program = Program()
with program_guard(program):
logits = layers.data(name='Logits', shape=[256], dtype='float64')
label = layers.data(name='Label', shape=[1], dtype='int64')
num_samples = 25
output = layers.sampled_softmax_with_cross_entropy(logits, label,
num_samples)
self.assertIsNotNone(output)
print(str(program))
@decorators.prog_scope()
def test_nce(self):
window_size = 5
......
......@@ -36,12 +36,14 @@ def lstmp(
w_b=None, # 1 x 4D
w_c=None, # 1 x 3D
is_reverse=False,
proj_clip=0.0,
cell_clip=0.0,
act_gate=None,
act_cell=None,
act_cand=None,
act_proj=None):
def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
act_proj):
def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate,
act_cell, act_cand, act_proj):
g = np.dot(r_pre, w_r) # 1 x 4D
g = g + x
g = np.reshape(g, (1, g.size))
......@@ -55,6 +57,17 @@ def lstmp(
g_f = act_gate(g_f + w_fc * c_pre) # 1 x D
c = g_f * c_pre + g_i * act_cand(c) # 1 x D
def array_clip(a, clip):
size = np.prod(a.shape)
new_a = np.reshape(a, (size))
for i in range(size):
new_a[i] = max(new_a[i], -1.0 * clip)
new_a[i] = min(new_a[i], clip)
new_a = np.reshape(new_a, a.shape)
return new_a
if cell_clip > 0.0:
c = array_clip(c, cell_clip)
if w_c is None:
g_o = act_gate(g_o) # 1 x D
else:
......@@ -64,6 +77,8 @@ def lstmp(
# projection
r = np.dot(h, w_rh)
r = act_proj(r)
if proj_clip > 0.0:
r = array_clip(r, proj_clip)
return r, c
def _reverse(x, offset):
......@@ -87,13 +102,13 @@ def lstmp(
# compute one sequence
seq_len = lod[0][i]
x = input[offset[i]:offset[i + 1], :]
r_pre = np.dot(h0[i], w_rh) # 1 x P
r_pre = act_proj(r_pre)
r_pre = h0[i]
c_pre = c0[i] # 1 x D
for j in range(seq_len):
# compute one step
r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
act_cell, act_cand, act_proj)
r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, proj_clip,
cell_clip, act_gate, act_cell, act_cand,
act_proj)
projection.append(r_pre.flatten())
cell.append(c_pre.flatten())
......@@ -123,13 +138,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
T = sum(self.lod[0])
N = len(self.lod[0])
x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
if self.has_initial_state:
h0 = np.random.normal(size=(N, self.D)).astype('float64')
h0 = np.random.normal(size=(N, self.P)).astype('float64')
c0 = np.random.normal(size=(N, self.D)).astype('float64')
else:
h0 = np.zeros((N, self.D)).astype('float64')
h0 = np.zeros((N, self.P)).astype('float64')
c0 = np.zeros((N, self.D)).astype('float64')
w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
if self.use_peepholes:
......@@ -140,9 +154,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
w_b = b[:, 0:4 * self.D]
w_c = b[:, 4 * self.D:] if self.use_peepholes else None
w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
proj_clip = 0.1
cell_clip = 0.1
r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
proj_clip, cell_clip, ACTIVATION[self.act_gate],
ACTIVATION[self.act_cell], ACTIVATION[self.act_cand],
ACTIVATION[self.act_proj])
self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
......@@ -159,6 +176,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
self.attrs = {
'use_peepholes': self.use_peepholes,
'is_reverse': self.is_reverse,
'proj_clip': proj_clip,
'cell_clip': cell_clip,
'gate_activation': self.act_gate,
'cell_activation': self.act_cell,
'candidate_activation': self.act_cand,
......@@ -171,14 +190,14 @@ class TestLstmpOp(LstmTest.TestLstmOp):
def test_check_grad(self):
# TODO(qingqing) remove the following lines after the check_grad is refined.
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2)
max_relative_error=1e-2,
numeric_grad_delta=0.0000005)
class TestLstmpOpHasInitial(TestLstmpOp):
......@@ -188,7 +207,6 @@ class TestLstmpOpHasInitial(TestLstmpOp):
def test_check_grad(self):
# TODO(qingqing) remove the following lines after the check_grad is refined.
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -196,11 +214,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
['Projection'],
numeric_grad_delta=0.0000005,
max_relative_error=1e-2)
def test_check_grad_ingore_bias(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -208,11 +226,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'ProjWeight', 'Weight'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Bias'))
def test_check_grad_ingore_weight(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -220,11 +238,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Weight'))
def test_check_grad_ingore_proj_weight(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -232,11 +250,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('ProjWeight'))
def test_check_grad_ingore_input(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -244,11 +262,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Weight', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Input'))
def test_check_grad_ingore_h0(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -256,11 +274,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('H0'))
def test_check_grad_ingore_c0(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -268,6 +286,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('C0'))
......
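As an aside, the element-wise array_clip helper added to the lstmp reference above can be expressed with a single vectorized call; a minimal equivalent sketch:
```python
import numpy as np

def array_clip(a, clip):
    # Equivalent to the loop in the reference implementation above:
    # limit every element to the closed interval [-clip, clip].
    return np.clip(a, -clip, clip)
```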
......@@ -144,15 +144,5 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
return [2, 3, 4, 5]
class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
def init_kernel_type(self):
self.use_mkldnn = True
class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp):
def get_x_shape(self):
return [2, 3, 4, 5]
if __name__ == "__main__":
unittest.main()
......@@ -13,6 +13,7 @@
# limitations under the License.
import os
import six
class PlotData(object):
......@@ -72,8 +73,8 @@ class Ploter(object):
plot_curve = Ploter("Curve 1","Curve 2")
plot_curve.append(title="Curve 1",step=1,value=1)
"""
assert isinstance(title, basestring)
assert self.__plot_data__.has_key(title)
assert isinstance(title, six.string_types)
assert title in self.__plot_data__
data = self.__plot_data__[title]
assert isinstance(data, PlotData)
data.append(step, value)
......
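The change above replaces Python 2-only constructs with portable ones; a minimal sketch of the same compatibility pattern (the helper name is hypothetical, only the six package is assumed):
```python
import six

def get_plot_data(plot_data, title):
    # six.string_types replaces basestring, and the `in` operator
    # replaces dict.has_key(), so this runs on Python 2 and 3.
    assert isinstance(title, six.string_types)
    assert title in plot_data
    return plot_data[title]
```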
......@@ -122,7 +122,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
def create_dataset_from_list(self, path):
data = []
label_set = []
for line in open(file_list):
for line in open(path):
items = line.rstrip().split()
image_path = items[0]
label_name = items[1]
......@@ -141,7 +141,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater):
path: the path of the image dataset.
"""
if self.from_list:
return create_dataset_from_list(path)
return self.create_dataset_from_list(path)
label_set = preprocess_util.get_label_set_from_dir(path)
data = []
for l_name in list(label_set.keys()):
......