Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into rewrite_allocation

461f71a9 · Yu Yang · 9dcddf92 · 0e241384 · 461f71a9 · 461f71a9
197 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,6 +127,9 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
 set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
  "A path setting fluid shared and static libraries")
+set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
+  "A path setting fluid inference shared and static libraries")
 if (WITH_C_API AND WITH_PYTHON)
  message(WARNING "It is suggest not embedded a python interpreter in Paddle "
    "when using C-API. It will give an unpredictable behavior when using a "

--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
+### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==0.15.0.post87
+pip install paddlepaddle-gpu==1.0.1.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==0.15.0.post85
+pip install paddlepaddle-gpu==1.0.1.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.15.0.post85
 ## Installation
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.
 ## Documentation
-We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
  You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)
  You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)
   Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)
   We appreciate your contributions!

--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -311,6 +311,8 @@ function(cc_test TARGET_NAME)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
 endfunction(cc_test)
@@ -629,6 +631,8 @@ function(py_test TARGET_NAME)
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
  endif()
 endfunction()

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -18,7 +18,7 @@ function(copy TARGET)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DSTS DEPS)
    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
+    set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -150,16 +150,16 @@ if (WITH_ANAKIN AND WITH_MKL)
        SRCS
        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
        ${ANAKIN_INSTALL_DIR} # anakin release
-        DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
+        DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
     list(APPEND inference_deps anakin_inference_lib)
 endif()
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+       ${src_dir}/${module}/api/paddle_inference_api.h
       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
 set(module "platform")
@@ -185,20 +185,41 @@ copy(cmake_cache
  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
  DSTS ${FLUID_INSTALL_DIR})
-add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+# This command generates a complete fluid library for both train and inference
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) 
+# Following commands generate a inference-only fluid library
+# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
+copy(third_party DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
+)
+# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library
+copy(inference_api_lib DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+       ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
+)
+add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
 # paddle fluid version
-execute_process(
+function(version version_file)
-  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  execute_process(
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-set(version_file ${FLUID_INSTALL_DIR}/version.txt)
+    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-file(WRITE ${version_file}
+  file(WRITE ${version_file}
-  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-  "WITH_MKL: ${WITH_MKL}\n"
+    "WITH_MKL: ${WITH_MKL}\n"
-  "WITH_GPU: ${WITH_GPU}\n")
+    "WITH_MKLDNN: ${WITH_MKLDNN}\n"
-if(WITH_GPU)
+    "WITH_GPU: ${WITH_GPU}\n")
-  file(APPEND ${version_file}
+  if(WITH_GPU)
-    "CUDA version: ${CUDA_VERSION}\n"
+    file(APPEND ${version_file}
-    "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+      "CUDA version: ${CUDA_VERSION}\n"
-endif()
+      "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+  endif()
+endfunction()
+version(${FLUID_INSTALL_DIR}/version.txt)
+version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -61,12 +61,12 @@ paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
+paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False))
+paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
@@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
 paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
@@ -84,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
 paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
 paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -95,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
-paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
@@ -114,6 +116,7 @@ paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], var
 paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
 paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
+paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
 paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
@@ -127,6 +130,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
 paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
 paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
 paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
@@ -170,6 +174,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -178,7 +183,6 @@ paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, k
 paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
-paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -64,7 +64,8 @@ class OpHandleBase {
  virtual bool IsMultiDeviceTransfer() { return false; }
  const platform::DeviceContext *DeviceContext(platform::Place place) {
-    return dev_ctxes_[place];
+    auto it = dev_ctxes_.find(place);
+    return it != dev_ctxes_.end() ? it->second : nullptr;
  }
  void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {

--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -49,6 +49,8 @@ struct VarHandleBase {
  void AddOutput(OpHandleBase* out, ir::Node* node) {
    if (pending_ops_.find(out) == pending_ops_.end()) {
+      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
+                     this->Node()->Name());
      pending_ops_.insert(out);
      node_->outputs.push_back(node);
    }

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
 }
+template <typename RefCntMap>
+static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
+                                GarbageCollector<Tensor>* gc,
+                                RefCntMap* ref_cnts) {
+  std::unordered_set<Tensor*> erase_tensors;
+  auto handler = [&](const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        auto it = ref_cnts->find(name);
+        if (it == ref_cnts->end()) continue;
+        if ((it->second)-- == 1) {
+          auto* var = scope.FindVar(name);
+          if (var != nullptr) {
+            VLOG(10) << "Erase tensor \'" << name << "\'";
+            if (var->IsType<LoDTensor>()) {
+              erase_tensors.insert(var->GetMutable<LoDTensor>());
+            } else if (var->IsType<SelectedRows>()) {
+              erase_tensors.insert(
+                  var->GetMutable<SelectedRows>()->mutable_value());
+            }
+          }
+        }
+      }
+    }
+  };
+  handler(op->Inputs());
+  handler(op->Outputs());
+  if (!erase_tensors.empty()) {
+    gc->Add(erase_tensors);
+  }
+}
 Executor::Executor(const platform::Place& place) : place_(place) {}
 void Executor::Close() {
@@ -66,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
  } else if (var_type == proto::VarType::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope>>();
+    var->GetMutable<std::vector<framework::Scope*>>();
  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }
  int64_t max_memory_size = GetEagerDeletionThreshold();
  std::unique_ptr<GarbageCollector<Tensor>> gc;
-  if (max_memory_size >= 0) {
+  // WhileOp would set keep_kids to false
+  // WhileGradOp would need the scopes created in WhileOp
+  // Perhaps, we should not perform eager deletion in WhileOp
+  // The scopes and variables created by WhileOp would be deleted
+  // in WhileGradOp.
+  if (max_memory_size >= 0 && !keep_kids) {
    ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    op->Run(*local_scope, place_);
    if (gc != nullptr) {
-      std::vector<std::string> erase_vars;
+      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
-      for (auto& input : op->Inputs()) {
+                          &(ctx->cur_ref_cnts_));
-        for (auto& input_name : input.second) {
-          auto it = ctx->cur_ref_cnts_.find(input_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {  // should delete it
-            erase_vars.emplace_back(input_name);
-            ctx->cur_ref_cnts_.erase(input_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-      for (auto& output : op->Outputs()) {
-        for (auto& output_name : output.second) {
-          auto it = ctx->cur_ref_cnts_.find(output_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {
-            erase_vars.emplace_back(output_name);
-            ctx->cur_ref_cnts_.erase(output_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-      if (!erase_vars.empty()) {
-        std::vector<framework::LoDTensor*> erase_tensors;
-        for (auto& name : erase_vars) {
-          auto* var = local_scope->FindVar(name);
-          if (var == nullptr) continue;
-          if (var->IsType<framework::LoDTensor>()) {
-            auto* tensor = var->GetMutable<framework::LoDTensor>();
-            erase_tensors.push_back(tensor);
-          }
-        }
-        if (!erase_tensors.empty()) gc->Add(erase_tensors);
-      }
    }
  }

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -32,38 +32,32 @@ template <typename T>
 std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
    const ProgramDesc& prog, size_t block_id) {
  auto& block = prog.Block(block_id);
-  std::unordered_set<std::string> ignored_vars;
  std::unordered_map<std::string, T> ref_cnts;
-  for (auto var_desc : block.AllVars()) {
+  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
-    auto type = var_desc->Proto()->type().type();
+    for (auto& name_pair : name_map) {
-    if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
+      for (auto& name : name_pair.second) {
-      ignored_vars.insert(var_desc->Name());  // ignore persistable vars
+        auto* var_desc = block.FindVar(name);
-    }
+        if (var_desc == nullptr || var_desc->Persistable()) continue;
-  }
+        auto type = var_desc->Proto()->type().type();
+        if (type != proto::VarType::LOD_TENSOR &&
-  for (auto op_desc : block.AllOps()) {
+            type != proto::VarType::SELECTED_ROWS) {
-    for (auto& input : op_desc->Inputs()) {
+          continue;
-      for (auto& input_name : input.second) {
-        if (!ignored_vars.count(input_name)) {
-          if (ref_cnts.count(input_name))
-            ++ref_cnts[input_name];
-          else
-            ref_cnts[input_name] = 1;
        }
-      }
-    }
-    for (auto& output : op_desc->Outputs()) {
+        auto it = ref_cnts.find(name);
-      for (auto output_name : output.second) {
+        if (it != ref_cnts.end()) {
-        if (!ignored_vars.count(output_name)) {
+          ++it->second;
-          if (ref_cnts.count(output_name))
+        } else {
-            ++ref_cnts[output_name];
+          ref_cnts[name] = 1;
-          else
-            ref_cnts[output_name] = 1;
        }
      }
    }
+  };
+  for (auto op_desc : block.AllOps()) {
+    update_ref_cnts(op_desc, op_desc->Inputs());
+    update_ref_cnts(op_desc, op_desc->Outputs());
  }
  return ref_cnts;
 }

--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
  // be created.
  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
  Variable* g_feed_value = scope->Var(var_name);
-  auto& feed_inputs =
+  auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
-      *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
  if (index >= feed_inputs.size()) {
    feed_inputs.resize(index + 1);
  }

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -10,7 +10,7 @@ function(pass_library TARGET DEST)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
    # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
        message(STATUS "add pass ${TARGET} ${DEST}")
@@ -25,21 +25,29 @@ cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
+cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
-if (WITH_MKLDNN)
-    pass_library(conv_relu_mkldnn_fuse_pass inference)
-endif ()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
 pass_library(embedding_fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
 pass_library(seq_concat_fc_fuse_pass inference)
+pass_library(conv_bn_fuse_pass inference)
+pass_library(seqconv_eltadd_relu_fuse_pass inference)
+if(WITH_MKLDNN)
+    pass_library(mkldnn_placement_pass base)
+    pass_library(conv_bias_mkldnn_fuse_pass inference)
+    pass_library(conv_relu_mkldnn_fuse_pass inference)
+endif()
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
+if(WITH_MKLDNN)
+  pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
+endif()
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
@@ -53,4 +61,5 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 if (WITH_MKLDNN)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -262,7 +262,7 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
  std::unordered_set<std::string> specified_vars({"data_lod_attention",
                                                  "cell_init", "hidden_init",
                                                  "data", "week", "minute"});
-  int count = 0;
+  size_t count = 0;
  for (auto* node : graph->Nodes()) {
    if (node->IsVar() && specified_vars.count(node->Name())) {
      ++count;

--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
+#include <functional>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+template <typename BinaryOperation>
+LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
+                               BinaryOperation f) {
+  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());
+  LoDTensor vec_y;
+  vec_y.Resize(vec_a.dims());
+  const float* a = vec_a.data<float>();
+  const float* b = vec_b.data<float>();
+  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
+  for (int i = 0; i < vec_a.numel(); i++) {
+    y[i] = f(a[i], b[i]);
+  }
+  return vec_y;
+}
+std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+  auto* scope = param_scope();
+  PADDLE_ENFORCE(scope);
+  GraphPatternDetector gpd;
+  auto* conv_input =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
+          ->AsInput()
+          ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
+  conv_bias_pattern(conv_input);
+  int found_conv_bias_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvBias fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
+                              conv_bias_pattern);                      // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern);  // tmp
+    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern);  // CONV op
+    // bias
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern);
+    // output
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern);
+    // elementwise_add op
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern);
+    PADDLE_ENFORCE(subgraph.count(conv_input));
+    // check if fuse can be done and if MKL-DNN should be used
+    FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
+    if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
+      VLOG(3) << "do not perform conv+bias fuse";
+      return;
+    }
+    auto* eltwise_bias_tensor =
+        scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
+    auto input_names = conv->Op()->InputNames();
+    bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") !=
+                    input_names.end();
+    if (has_bias && conv->Op()->Input("Bias").size() > 0) {
+      auto conv_bias_names = conv->Op()->Input("Bias");
+      // add eltwise bias to existing conv bias
+      PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+      auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
+      auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
+      PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims());
+      *conv_bias_tensor = tensor_apply_eltwise(
+          *conv_bias_tensor, *eltwise_bias_tensor, std::plus<float>());
+      conv->Op()->SetOutput("Output",
+                            std::vector<std::string>({eltwise_out->Name()}));
+      GraphSafeRemoveNodes(graph.get(), {eltwise, conv_out});
+      IR_NODE_LINK_TO(conv, eltwise_out);
+    } else {
+      // take eltwise bias as conv bias
+      OpDesc desc;
+      desc.SetInput(
+          "Input", std::vector<std::string>({subgraph.at(conv_input)->Name()}));
+      desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
+      desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
+      desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
+      desc.SetType("conv2d");
+      for (auto& attr : conv->Op()->GetAttrMap()) {
+        desc.SetAttr(attr.first, attr.second);
+      }
+      auto conv_bias_node = g->CreateOpNode(&desc);
+      IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node);
+      IR_NODE_LINK_TO(conv_weight, conv_bias_node);
+      IR_NODE_LINK_TO(eltwise_bias, conv_bias_node);
+      IR_NODE_LINK_TO(conv_bias_node, eltwise_out);
+      GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out});
+    }
+    found_conv_bias_count++;
+  };
+  gpd(graph.get(), handler);
+  AddStatis(found_conv_bias_count);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvBiasFusePass);
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+* Fuse the Conv and Elementwise_add to a ConvBiasOp.
+*/
+class ConvBiasFusePass : public FusePassBase {
+ public:
+  virtual ~ConvBiasFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"conv_bias_mkldnn_fuse"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h"
+#include <functional>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+#define GET_CONV_BN_NODES(pattern_name)                                      \
+  /* OPERATORS */                                                            \
+  GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name);                       \
+  GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, pattern_name);           \
+  /* CONV inputs */                                                          \
+  GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name);         \
+  /* CONV outputs */                                                         \
+  GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name);               \
+  /* BN inputs */                                                            \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, pattern_name);               \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, pattern_name);                 \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_mean, bn_mean, pattern_name);                 \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_variance, bn_variance, pattern_name);         \
+  /* BN outputs */                                                           \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, pattern_name); /* Out */         \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, pattern_name);         \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out, pattern_name); \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name);     \
+  GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
+void recompute_bias_and_weights(const Scope* scope,
+                                ir::Node* conv_weight,            //
+                                const ir::Node& bn_scale,         //
+                                const LoDTensor& bn_bias_tensor,  //
+                                const ir::Node& bn_mean,          //
+                                const ir::Node& bn_variance,      //
+                                LoDTensor* eltwise_y_in_tensor,   //
+                                float epsilon) {
+  using EigenVectorArrayMap =
+      Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
+  using ConstEigenVectorArrayMap =
+      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
+  using EigenMatrixArrayMap = Eigen::Map<
+      Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+  // Re-compute bias of conv2d from BN
+  PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims());
+  auto* scale_tensor = scope->FindVar(bn_scale.Name())->GetMutable<LoDTensor>();
+  auto* variance_tensor =
+      scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>();
+  auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>();
+  ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
+                                       scale_tensor->numel(), 1);
+  EigenVectorArrayMap variance_array(
+      variance_tensor->mutable_data<float>(platform::CPUPlace()),
+      variance_tensor->numel(), 1);
+  ConstEigenVectorArrayMap mean_array(mean_tensor->data<float>(),
+                                      mean_tensor->numel(), 1);
+  ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data<float>(),
+                                         bn_bias_tensor.numel(), 1);
+  // variance will not be used anymore, so make it std_array and then tmp_array
+  variance_array += epsilon;
+  variance_array = variance_array.sqrt();
+  variance_array = scale_array / variance_array;
+  EigenVectorArrayMap eltwise_y_in_array(
+      eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
+      eltwise_y_in_tensor->numel(), 1);
+  eltwise_y_in_array =
+      ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
+  // Re-compute weight of conv2d from BN
+  auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
+  auto weights_shape = weights->dims();
+  auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
+  EigenMatrixArrayMap weights_array_2d(
+      weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
+      weights_shape_2d[1]);
+  weights_array_2d.colwise() *= variance_array;
+}
+std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+  auto* scope = param_scope();
+  PADDLE_ENFORCE(scope);
+  GraphPatternDetector gpd;
+  auto* conv_input =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
+          ->AsInput()
+          ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_);
+  conv_bn_pattern(conv_input, false /*with_eltwise_add*/);
+  int found_conv_bn_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvBN fuse";
+    // conv, batch_norm,
+    // conv_weight, conv_out,
+    // bn_scale, bn_bias, bn_mean, bn_variance,
+    // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,
+    // bn_saved_variance
+    GET_CONV_BN_NODES(conv_bn_pattern);
+    // check if fuse can be done and if MKL-DNN should be used
+    FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
+    if (fuse_option == DO_NOT_FUSE) {
+      VLOG(3) << "do not perform conv+bn fuse";
+      return;
+    }
+    // Create eltwise_y (conv bias) variable
+    VarDesc eltwise_y_in_desc(
+        patterns::PDNodeName(name_scope_, "eltwise_y_in"));
+    eltwise_y_in_desc.SetPersistable(true);
+    auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
+    auto* eltwise_y_in_tensor =
+        scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
+    // Get batch norm bias
+    auto* bn_bias_tensor =
+        scope->FindVar(bn_bias->Name())->GetMutable<LoDTensor>();
+    // Initialize eltwise_y
+    eltwise_y_in_tensor->Resize(bn_bias_tensor->dims());
+    std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
+                eltwise_y_in_tensor->numel(), 0.0f);
+    // update weights and biases
+    float epsilon = boost::get<float>(batch_norm->Op()->GetAttr("epsilon"));
+    recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor,
+                               *bn_mean, *bn_variance, eltwise_y_in_tensor,
+                               epsilon);
+    // with MKL-DNN fuse conv+bn into conv with bias
+    // without MKL-DNN fuse conv+bn into conv+elementwise_add
+    if (fuse_option == FUSE_MKLDNN) {
+      auto input_names = conv->Op()->InputNames();
+      bool has_bias = std::find(input_names.begin(), input_names.end(),
+                                "Bias") != input_names.end();
+      if (has_bias && conv->Op()->Input("Bias").size() > 0) {
+        // reuse existing conv bias node
+        auto conv_bias_names = conv->Op()->Input("Bias");
+        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+        auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
+        auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
+        PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
+                          eltwise_y_in_tensor->dims());
+        auto eigen_conv_bias = EigenVector<float>::From(*conv_bias_tensor);
+        eigen_conv_bias += EigenVector<float>::From(*eltwise_y_in_tensor);
+      } else {
+        // add new conv_bias node
+        conv->Op()->SetInput(
+            "Bias", std::vector<std::string>({eltwise_y_in_node->Name()}));
+        IR_NODE_LINK_TO(eltwise_y_in_node, conv);
+      }
+      conv->Op()->SetOutput("Output",
+                            std::vector<std::string>({bn_out->Name()}));
+      GraphSafeRemoveNodes(
+          graph.get(),
+          {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm,
+           bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance});
+      IR_NODE_LINK_TO(conv, bn_out);
+      found_conv_bn_count++;
+    } else {  // fuse_option == FUSE_NATIVE
+      // create an elementwise add node.
+      OpDesc desc;
+      desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
+      desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
+      desc.SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
+      desc.SetType("elementwise_add");
+      desc.SetAttr("axis", 1);
+      auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
+      GraphSafeRemoveNodes(
+          graph.get(),
+          {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
+           bn_variance_out, bn_saved_mean, bn_saved_variance});
+      IR_NODE_LINK_TO(conv_out, eltwise_op);
+      IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
+      IR_NODE_LINK_TO(eltwise_op, bn_out);
+      found_conv_bn_count++;
+    }
+  };
+  gpd(graph.get(), handler);
+  AddStatis(found_conv_bn_count);
+  return graph;
+}
+std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+  auto* scope = param_scope();
+  PADDLE_ENFORCE(scope);
+  GraphPatternDetector gpd;
+  auto* conv_input =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
+          ->AsInput()
+          ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_);
+  conv_bn_pattern(conv_input, true /*with_eltwise_add*/);
+  int found_conv_bn_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvBN fuse";
+    // conv, batch_norm,
+    // conv_weight, conv_out,
+    // bn_scale, bn_bias, bn_mean, bn_variance,
+    // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,bn_saved_variance
+    GET_CONV_BN_NODES(conv_bn_pattern);
+    // OPERATORS
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bn_pattern);
+    // BIAS inputs
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_bn_pattern);
+    // BIAS outputs
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bn_pattern);
+    // Get eltwise_y (conv bias) variable
+    auto* eltwise_y_in_tensor =
+        scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
+    // Get batch norm bias
+    auto* bn_bias_tensor =
+        scope->FindVar(bn_bias->Name())->GetMutable<LoDTensor>();
+    // update weights and biases
+    float epsilon = boost::get<float>(batch_norm->Op()->GetAttr("epsilon"));
+    recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor,
+                               *bn_mean, *bn_variance, eltwise_y_in_tensor,
+                               epsilon);
+    // Update the elementwise_add node
+    eltwise->Op()->SetAttr("axis", 1);
+    eltwise->Op()->SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
+    GraphSafeRemoveNodes(
+        graph.get(),
+        {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
+         bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out});
+    IR_NODE_LINK_TO(eltwise, bn_out);
+    found_conv_bn_count++;
+  };
+  gpd(graph.get(), handler);
+  AddStatis(found_conv_bn_count);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(conv_bn_fuse_pass, paddle::framework::ir::ConvBNFusePass);
+REGISTER_PASS(conv_eltwiseadd_bn_fuse_pass,
+              paddle::framework::ir::ConvEltwiseAddBNFusePass);
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp.
+ */
+class ConvBNFusePass : public FusePassBase {
+ public:
+  virtual ~ConvBNFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"conv_bn_fuse"};
+};
+class ConvEltwiseAddBNFusePass : public FusePassBase {
+ public:
+  virtual ~ConvEltwiseAddBNFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include <functional>
+#include <utility>
+#include "paddle/fluid/framework/ir/graph_traits.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+// The function keeps the graph consistent by replacing
+// a node 'from' in the set of inputs nodes
+// of the visited node by a node 'to'.
+void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
+  for (auto& node : GraphTraits::DFS(*graph)) {
+    auto from_in_inputs =
+        std::find(std::begin(node.inputs), std::end(node.inputs), from);
+    if (from_in_inputs != std::end(node.inputs)) {
+      IR_NODE_LINK_TO(to, (&node));
+      auto inputs = node.Op()->Inputs();
+      using input_type = VariableNameMap::value_type;
+      std::for_each(std::begin(inputs), std::end(inputs),
+                    [from, to, &node](const input_type& i) -> void {
+                      auto param_names = i.second;
+                      auto pi = std::find(std::begin(param_names),
+                                          std::end(param_names), from->Name());
+                      if (pi != std::end(param_names)) {
+                        node.Op()->SetInput(i.first, {to->Name()});
+                      }
+                    });
+    }
+  }
+}
+}  // namespace
+using graph_ptr = std::unique_ptr<ir::Graph>;
+graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::Conv conv_pattern{pattern, name_scope_};
+  auto conv_output = conv_pattern();
+  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_};
+  elementwise_add_pattern(conv_output);
+  conv_output->AsIntermediate();
+  auto conv_op_has_bias = [](const Node& conv_op) -> std::pair<bool, Node*> {
+    auto bias_input_names = conv_op.Op()->Inputs();
+    auto bias_it = bias_input_names.find("Bias");
+    if (bias_it != std::end(bias_input_names)) {
+      bool has_bias = !bias_it->second.empty();
+      if (has_bias) {
+        auto conv_bias_names = bias_it->second;
+        auto conv_bias_names_it =
+            std::find_if(std::begin(conv_op.inputs), std::end(conv_op.inputs),
+                         [&conv_bias_names](Node* n) -> bool {
+                           return n->Name() == conv_bias_names[0];
+                         });
+        return std::make_pair(has_bias, *conv_bias_names_it);
+      }
+    }
+    return std::make_pair(false, nullptr);
+  };
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                              elementwise_add_pattern);
+    if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+    OpDesc op_desc;
+    op_desc.SetType("conv2d");
+    op_desc.SetInput("Input", {conv_input->Name()});
+    op_desc.SetInput("Filter", {conv_filter->Name()});
+    op_desc.SetInput("ResidualData", {elementwise_add_x->Name()});
+    op_desc.SetOutput("Output", {conv_output->Name()});
+    bool has_bias;
+    Node* conv_bias;
+    std::tie(has_bias, conv_bias) = conv_op_has_bias(*conv_op);
+    if (has_bias) {
+      op_desc.SetInput("Bias", {conv_bias->Name()});
+    }
+    for (const auto& attr : conv_op->Op()->GetAttrMap()) {
+      op_desc.SetAttr(attr.first, attr.second);
+    }
+    op_desc.SetAttr("fuse_residual_connection", true);
+    auto fused_conv_op = g->CreateOpNode(&op_desc);
+    IR_NODE_LINK_TO(conv_input, fused_conv_op);
+    IR_NODE_LINK_TO(conv_filter, fused_conv_op);
+    IR_NODE_LINK_TO(elementwise_add_x, fused_conv_op);
+    IR_NODE_LINK_TO(fused_conv_op, conv_output);
+    if (has_bias) {
+      IR_NODE_LINK_TO(conv_bias, fused_conv_op);
+    }
+    CorrectGraphEdges(g, elementwise_add_out, conv_output);
+    GraphSafeRemoveNodes(g, {elementwise_add_out, conv_op, elementwise_add_op});
+  };
+  gpd(graph.get(), handler);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAddMKLDNNFusePass);
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class ConvElementwiseAddMKLDNNFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAddMKLDNNFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"residual_connections_fuse_pass"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <gtest/gtest.h>
+#include <string>
+#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+constexpr int nodes_removed = 3;
+constexpr int nodes_added = 1;
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::pair<std::string, std::string>>& inputs,
+           const std::pair<std::string, std::string>& output) {
+  auto op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", true);
+  for (const auto& input : inputs) {
+    op->SetInput(input.first, {input.second});
+  }
+  op->SetOutput(output.first, {output.second});
+}
+struct IsReachable {
+  using func = std::function<bool(const std::string&, const std::string&)>;
+  auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
+    auto find_node = [](const std::unique_ptr<ir::Graph>& graph,
+                        const std::string& name) -> Node* {
+      for (auto& node : GraphTraits::DFS(*graph)) {
+        if (name == node.Name()) {
+          return &node;
+        }
+      }
+      return nullptr;
+    };
+    return [&](std::string from, const std::string to) -> bool {
+      if (from == to) return true;
+      std::map<std::string, bool> visited;
+      for (auto& node : GraphTraits::DFS(*graph)) {
+        visited[node.Name()] = false;
+      }
+      visited[from] = true;
+      std::list<std::string> queue;
+      queue.push_back(from);
+      while (!queue.empty()) {
+        auto cur = find_node(graph, queue.front());
+        queue.pop_front();
+        if (cur == nullptr) return false;
+        for (auto n : cur->outputs) {
+          if (n->Name() == to) return true;
+          if (!visited[n->Name()]) {
+            visited[n->Name()] = true;
+            queue.push_back(n->Name());
+          }
+        }
+      }
+      return false;
+    };
+  }
+};
+void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
+  int conv_count = 0;
+  int elementwise_add_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      ++conv_count;
+    }
+    if (node->IsOp() && node->Op()->Type() == "elementwise_add") {
+      ++elementwise_add_count;
+    }
+  }
+  EXPECT_EQ(conv_count, 1);
+  EXPECT_EQ(elementwise_add_count, 0);
+}
+ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
+                             const std::vector<std::string>& persistent_vars) {
+  ProgramDesc prog;
+  auto add_var_to_prog = [&prog](const std::string& var_name) -> VarDesc* {
+    auto var = prog.MutableBlock(0)->Var(var_name);
+    var->SetType(proto::VarType::LOD_TENSOR);
+    return var;
+  };
+  for (const auto& v : transient_vars) {
+    add_var_to_prog(v);
+  }
+  for (const auto& v : persistent_vars) {
+    auto var = add_var_to_prog(v);
+    var->SetPersistable(true);
+  }
+  return prog;
+}
+}  // namespace
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
+  auto prog =
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+TEST(ConvElementwiseAddMKLDNNFusePass,
+     ConvolutionWithElementwiseAddReluNoBias) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
+  SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "d"));
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  EXPECT_FALSE(is_reachable(graph)("a", "d"));
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
+  auto prog =
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
+  SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+USE_PASS(conv_elementwise_add_mkldnn_fuse_pass);
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -46,6 +46,12 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op
+    FuseOptions fuse_option = FindFuseOption(*conv, *relu);
+    if (fuse_option == DO_NOT_FUSE) {
+      VLOG(3) << "do not perform conv+relu fuse";
+      return;
+    }
    // Transform Conv node into ConvReLU node.
    OpDesc* desc = conv->Op();
    desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));

--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -20,17 +20,19 @@ namespace paddle {
 namespace framework {
 namespace ir {
-void SetOp(ProgramDesc* prog, const std::string& type,
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  if (type == "conv2d") {
-    op->SetAttr("use_mkldnn", true);
+    op->SetAttr("use_mkldnn", use_mkldnn);
+    op->SetAttr("name", name);
    op->SetInput("Input", {inputs[0]});
    op->SetInput("Filter", {inputs[1]});
    op->SetInput("Bias", {inputs[2]});
  } else if (type == "relu") {
+    op->SetAttr("use_mkldnn", use_mkldnn);
    op->SetInput("X", inputs);
  }
  op->SetOutput("Out", outputs);
@@ -43,7 +45,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
 ProgramDesc BuildProgramDesc() {
  ProgramDesc prog;
  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g"})) {
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
+                                 "h", "weights2", "bias2", "k", "l"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    var->SetType(proto::VarType::SELECTED_ROWS);
    if (v == "weights" || v == "bias") {
@@ -51,14 +54,24 @@ ProgramDesc BuildProgramDesc() {
    }
  }
-  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+  SetOp(&prog, "OP0", "op0", std::vector<std::string>({"a"}),
        std::vector<std::string>({"b"}));
-  SetOp(&prog, "OP1", std::vector<std::string>({"b"}),
+  SetOp(&prog, "OP1", "op1", std::vector<std::string>({"b"}),
        std::vector<std::string>({"c"}));
-  SetOp(&prog, "conv2d", std::vector<std::string>({"c", "weights", "bias"}),
+  // conv+relu, both with MKL-DNN
-        std::vector<std::string>({"f"}));
+  SetOp(&prog, "conv2d", "conv1",
-  SetOp(&prog, "relu", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"c", "weights", "bias"}),
-        std::vector<std::string>({"g"}));
+        std::vector<std::string>({"f"}), true);
+  SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"g"}), true);
+  SetOp(&prog, "OP3", "op3", std::vector<std::string>({"g"}),
+        std::vector<std::string>({"h"}));
+  // conv+relu, only one with MKL-DNN
+  SetOp(&prog, "conv2d", "conv2",
+        std::vector<std::string>({"h", "weights2", "bias2"}),
+        std::vector<std::string>({"k"}), true);
+  SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
+        std::vector<std::string>({"l"}));
  return prog;
 }
@@ -88,10 +101,16 @@ TEST(ConvReLUFusePass, basic) {
      auto* op = node->Op();
      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
-      ASSERT_TRUE(op->HasAttr("fuse_relu"));
+      // check if only "conv1" convolution is fused
-      bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
-      if (fuse_relu) {
+      if (op_name == "conv1") {
-        ++conv_relu_count;
+        ASSERT_TRUE(op->HasAttr("fuse_relu"));
+        bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
+        if (fuse_relu) {
+          ++conv_relu_count;
+        }
+      } else if (op_name == "conv2") {
+        ASSERT_FALSE(op->HasAttr("fuse_relu"));
      }
    }
  }

--- a/paddle/fluid/framework/ir/fuse_pass_base.cc
+++ b/paddle/fluid/framework/ir/fuse_pass_base.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+void FusePassBase::Init(const std::string& repr, Graph* graph) const {
+  repr_ = repr;
+  graph_ = graph;
+}
+Scope* FusePassBase::param_scope() const {
+  PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
+  return graph_->Get<framework::Scope*>(kParamScopeAttr);
+}
+void FusePassBase::AddStatis(int count_of_fused) const {
+  PADDLE_ENFORCE(graph_);
+  PADDLE_ENFORCE(!repr_.empty());
+  if (!graph_->Has(kFuseStatisAttr)) {
+    graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
+  }
+  auto& info =
+      graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
+  info[repr_] = count_of_fused;
+}
+FuseOptions FusePassBase::FindFuseOption(const Node& node1,
+                                         const Node& node2) const {
+#ifdef PADDLE_WITH_MKLDNN
+  bool node1_mkldnn = node1.Op()->HasAttr("use_mkldnn") &&
+                      boost::get<bool>(node1.Op()->GetAttr("use_mkldnn"));
+  bool node2_mkldnn = node2.Op()->HasAttr("use_mkldnn") &&
+                      boost::get<bool>(node2.Op()->GetAttr("use_mkldnn"));
+  if (node1_mkldnn && node2_mkldnn)
+    return FUSE_MKLDNN;
+  else if (!node1_mkldnn && !node2_mkldnn)
+    return FUSE_NATIVE;
+  else
+    return DO_NOT_FUSE;
+#else
+  return FUSE_NATIVE;
+#endif
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
@@ -25,32 +25,24 @@ namespace ir {
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
+enum FuseOptions {
+  DO_NOT_FUSE,  // fusing will not be done
+  FUSE_NATIVE,  // fusing will be done without MKL-DNN
+  FUSE_MKLDNN   // fusing will be done with MKL-DNN
+};
 class FusePassBase : public Pass {
 public:
-  void Init(const std::string& repr, Graph* graph) const {
+  void Init(const std::string& repr, Graph* graph) const;
-    repr_ = repr;
+  Scope* param_scope() const;
-    graph_ = graph;
+  void AddStatis(int count_of_fused) const;
-  }
-  Scope* param_scope() const {
-    PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
-    return graph_->Get<framework::Scope*>(kParamScopeAttr);
-  }
-  void AddStatis(int count_of_fused) const {
-    PADDLE_ENFORCE(graph_);
-    PADDLE_ENFORCE(!repr_.empty());
-    if (!graph_->Has(kFuseStatisAttr)) {
-      graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
-    }
-    auto& info =
-        graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
-    info[repr_] = count_of_fused;
-  }
  virtual ~FusePassBase() {}
 protected:
+  virtual FuseOptions FindFuseOption(const Node& node1,
+                                     const Node& node2) const;
  mutable Graph* graph_;
  mutable std::string repr_;
 };

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -259,6 +259,8 @@ GraphPatternDetector::DetectPatterns() {
  return result;
 }
+// TODO(Superjomn) enhance the function as it marks unique unique as duplicates
+// see https://github.com/PaddlePaddle/Paddle/issues/13550
 void GraphPatternDetector::UniquePatterns(
    std::vector<GraphPatternDetector::subgraph_t> *subgraphs) {
  if (subgraphs->empty()) return;
@@ -626,6 +628,112 @@ bool VarLinksFromOp(Node *node, const std::string &op_type) {
  return false;
 }
+PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input,
+                                     bool with_eltwise_add) {
+  // Create Operators
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  PDNode *eltwise_op = nullptr;
+  if (with_eltwise_add) {
+    eltwise_op =
+        pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
+  }
+  auto *batch_norm_op =
+      pattern->NewNode(batch_norm_repr())->assert_is_op("batch_norm");
+  // Create variables
+  // Conv Filter
+  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("conv2d", "Filter");
+  auto *conv_out_var = pattern->NewNode(conv_out_repr())
+                           ->AsIntermediate()
+                           ->assert_is_only_output_of_op("conv2d");
+  PDNode *eltwise_y_in_var = nullptr;
+  PDNode *eltwise_out_var = nullptr;
+  if (with_eltwise_add) {
+    // Conv output as Bias input
+    conv_out_var->assert_is_op_input("elementwise_add", "X");
+    // Bias
+    eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr())
+                           ->assert_is_op_input("elementwise_add", "Y")
+                           ->AsInput();
+    eltwise_out_var = pattern->NewNode(eltwise_out_repr())
+                          ->AsIntermediate()
+                          ->assert_is_only_output_of_op("elementwise_add");
+  } else {
+    // Conv output as BN input
+    conv_out_var->assert_is_op_input("batch_norm", "X");
+  }
+  // BN Scale
+  auto *bn_scale_var = pattern->NewNode(bn_scale_repr())
+                           ->AsInput()
+                           ->assert_is_persistable_var()
+                           ->assert_is_op_input("batch_norm", "Scale");
+  // BN Bias
+  auto *bn_bias_var = pattern->NewNode(bn_bias_repr())
+                          ->AsInput()
+                          ->assert_is_persistable_var()
+                          ->assert_is_op_input("batch_norm", "Bias");
+  // BN Mean
+  auto *bn_mean_var = pattern->NewNode(bn_mean_repr())
+                          ->AsInput()
+                          ->assert_is_persistable_var()
+                          ->assert_is_op_input("batch_norm", "Mean");
+  // BN Variance
+  auto *bn_variance_var = pattern->NewNode(bn_variance_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("batch_norm", "Variance");
+  // BN output
+  auto *bn_out_var = pattern->NewNode(bn_out_repr())
+                         ->AsOutput()
+                         ->assert_is_op_output("batch_norm");
+  auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr())
+                              ->AsOutput()
+                              ->assert_is_op_output("batch_norm", "MeanOut");
+  auto *bn_variance_out_var =
+      pattern->NewNode(bn_variance_out_repr())
+          ->AsOutput()
+          ->assert_is_op_output("batch_norm", "VarianceOut");
+  auto *bn_saved_mean_var =
+      pattern->NewNode(bn_saved_mean_repr())
+          ->AsOutput()
+          ->assert_is_op_output("batch_norm", "SavedMean");
+  auto *bn_saved_variance_var =
+      pattern->NewNode(bn_saved_variance_repr())
+          ->AsOutput()
+          ->assert_is_op_output("batch_norm", "SavedVariance");
+  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
+  if (with_eltwise_add) {
+    eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var})
+        .LinksTo({eltwise_out_var});
+    batch_norm_op
+        ->LinksFrom({eltwise_out_var, bn_scale_var, bn_bias_var, bn_mean_var,
+                     bn_variance_var})
+        .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var,
+                  bn_saved_mean_var, bn_saved_variance_var});
+  } else {
+    batch_norm_op
+        ->LinksFrom({conv_out_var, bn_scale_var, bn_bias_var, bn_mean_var,
+                     bn_variance_var})
+        .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var,
+                  bn_saved_mean_var, bn_saved_variance_var});
+  }
+  return bn_out_var;
+}
 PDNode *patterns::ConvReLU::operator()(
    paddle::framework::ir::PDNode *conv_input) {
  // Create Operators
@@ -653,6 +761,51 @@ PDNode *patterns::ConvReLU::operator()(
  return relu_out_var;
 }
+PDNode *patterns::SeqConvEltAddRelu::operator()(
+    paddle::framework::ir::PDNode *seqconv_input) {
+  // Create Operators
+  seqconv_input->assert_is_op_input("sequence_conv", "X");
+  auto *seqconv_op = pattern->NewNode(seqconv_repr())
+                         ->assert_is_op("sequence_conv")
+                         ->assert_op_attr<bool>("paddingTrainable", false)
+                         ->assert_op_attr<int>("contextStride", 1);
+  auto *eltadd_op =
+      pattern->NewNode(eltadd_repr())->assert_is_op("elementwise_add");
+  auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
+  // Create variables
+  // Filter
+  auto *seqconv_weight_var =
+      pattern->NewNode(seqconv_weight_repr())
+          ->AsInput()
+          ->assert_is_persistable_var()
+          ->assert_is_op_input("sequence_conv", "Filter");
+  // Bias
+  auto *eltadd_bias_var = pattern->NewNode(eltadd_bias_repr())
+                              ->AsInput()
+                              ->assert_is_op_input("elementwise_add");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto *seqconv_out_var = pattern->NewNode(seqconv_out_repr())
+                              ->AsIntermediate()
+                              ->assert_is_only_output_of_op("sequence_conv")
+                              ->assert_is_op_input("elementwise_add");
+  auto *eltadd_out_var = pattern->NewNode(eltadd_out_repr())
+                             ->AsIntermediate()
+                             ->assert_is_only_output_of_op("elementwise_add")
+                             ->assert_is_only_input_of_op("relu");
+  // output
+  auto *relu_out_var = pattern->NewNode(relu_out_repr())
+                           ->AsOutput()
+                           ->assert_is_op_output("relu");
+  seqconv_op->LinksFrom({seqconv_input, seqconv_weight_var})
+      .LinksTo({seqconv_out_var});
+  eltadd_op->LinksFrom({seqconv_out_var, eltadd_bias_var})
+      .LinksTo({eltadd_out_var});
+  relu_op->LinksFrom({eltadd_out_var}).LinksTo({relu_out_var});
+  return relu_out_var;
+}
 PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
                                 bool with_bias) {
  // Create shared nodes.
@@ -858,6 +1011,79 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
  return ele_add_grad;
 }
+PDNode *patterns::ConvBias::operator()(
+    paddle::framework::ir::PDNode *conv_input) {
+  // Create Operators
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  auto *eltiwse_op =
+      pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
+  // Create variables
+  // Filter
+  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("conv2d", "Filter");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto *conv_out_var = pattern->NewNode(conv_out_repr())
+                           ->AsIntermediate()
+                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_op_input("elementwise_add");
+  // Bias stored in elementwise_add
+  auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
+                               ->AsInput()
+                               ->assert_is_persistable_var()
+                               ->assert_is_op_input("elementwise_add", "Y");
+  // output
+  auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr())
+                              ->AsOutput()
+                              ->assert_is_op_output("elementwise_add");
+  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
+  eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var})
+      .LinksTo({eltwise_out_var});
+  return eltwise_out_var;
+}
+PDNode *patterns::Conv::operator()() {
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto input_var = pattern->NewNode(conv_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("conv2d", "Input");
+  auto filter_var = pattern->NewNode(conv_filter_repr())
+                        ->AsInput()
+                        ->assert_is_op_input("conv2d", "Filter");
+  auto output_var = pattern->NewNode(conv_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("conv2d", "Output");
+  conv_op->LinksFrom({input_var, filter_var});
+  conv_op->LinksTo({output_var});
+  return output_var;
+}
+PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) {
+  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                ->assert_is_op("elementwise_add");
+  x_var->assert_is_op_input("elementwise_add", "X");
+  auto y_var = pattern->NewNode(elementwise_add_x_repr())
+                   ->AsInput()
+                   ->assert_is_op_input("elementwise_add", "Y");
+  auto out_var = pattern->NewNode(elementwise_add_out_repr())
+                     ->AsOutput()
+                     ->assert_is_op_output("elementwise_add", "Out");
+  elementwise_add_op->LinksFrom({x_var, y_var});
+  elementwise_add_op->LinksTo({out_var});
+  return out_var;
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -128,6 +128,15 @@ struct PDNode {
      const std::unordered_set<std::string>& op_types,
      const std::string& argument, int nth);
+  template <typename T>
+  PDNode* assert_op_attr(const std::string& attr_name, const T& attr) {
+    asserts_.emplace_back([=](Node* x) {
+      return x && x->IsOp() && x->Op()->HasAttr(attr_name) &&
+             boost::get<T>(x->Op()->GetAttr(attr_name)) == attr;
+    });
+    return this;
+  }
 private:
  PDNode(PDPattern* pattern, const std::string& name = "",
         Type type = Type::kVar)
@@ -375,6 +384,44 @@ struct PatternBase {
  size_t id_;
 };
+// Conv with batch norm
+// op: conv + (elementwise_add +) batch_norm
+// named nodes:
+// conv_weight, conv_out, conv,
+// bn_x, bn_scale, bn_bias, bn_mean,  bn_variance,
+// bn_batch_norm, bn_y, bn_mean_out, bn_variance_out,
+// bn_saved_mean, bn_saved_variance
+struct ConvBN : public PatternBase {
+  ConvBN(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_bn") {}
+  PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
+  // declare operator node's name
+  PATTERN_DECL_NODE(conv);
+  PATTERN_DECL_NODE(batch_norm);
+  PATTERN_DECL_NODE(eltwise);  // ELEMENTWISE_ADD
+  // CONV inputs
+  PATTERN_DECL_NODE(conv_weight);  // Filter
+  // CONV outputs
+  PATTERN_DECL_NODE(conv_out);  // tmp
+  // ELTWISE inputs
+  PATTERN_DECL_NODE(eltwise_y_in);
+  // ELTWISE outputs
+  PATTERN_DECL_NODE(eltwise_out);  // tmp
+  // BN inputs
+  PATTERN_DECL_NODE(bn_scale);
+  PATTERN_DECL_NODE(bn_bias);
+  PATTERN_DECL_NODE(bn_mean);
+  PATTERN_DECL_NODE(bn_variance);
+  // BN outputs
+  PATTERN_DECL_NODE(bn_out);  // Out
+  PATTERN_DECL_NODE(bn_mean_out);
+  PATTERN_DECL_NODE(bn_variance_out);
+  PATTERN_DECL_NODE(bn_saved_mean);
+  PATTERN_DECL_NODE(bn_saved_variance);
+};
 // CONV with ReLU
 // op: conv + relu
 // named nodes:
@@ -396,6 +443,31 @@ struct ConvReLU : public PatternBase {
  PATTERN_DECL_NODE(relu_out);
 };
+// SEQCONV with Elementwise_Add ReLU
+// op: seqconv + elementwise_add + relu
+// named nodes:
+// seqconv_input, seqconv_weight,
+// seqconv_out, seqconv,
+// elementwise_add_bias, elementwise_add_out, elementwise_add
+// relu_out, relu
+struct SeqConvEltAddRelu : public PatternBase {
+  SeqConvEltAddRelu(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "seqconv_eltadd_relu") {}
+  PDNode* operator()(PDNode* seqconv_input);
+  // declare operator node's name
+  PATTERN_DECL_NODE(seqconv);
+  PATTERN_DECL_NODE(eltadd);
+  PATTERN_DECL_NODE(relu);
+  // declare variable node's name
+  PATTERN_DECL_NODE(seqconv_weight);
+  PATTERN_DECL_NODE(seqconv_out);
+  PATTERN_DECL_NODE(eltadd_bias);
+  PATTERN_DECL_NODE(eltadd_out);
+  PATTERN_DECL_NODE(relu_out);
+};
 // FC with bias
 // op: mul + elementwise_add
 // named nodes:
@@ -540,6 +612,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
  PATTERN_DECL_NODE(d_ele_y);
  PATTERN_DECL_NODE(ele_y);
 };
+// Conv with Elementwise_add as bias
+// op: conv + elementwise_add
+// named nodes:
+// conv_input, conv_weight,
+// conv_out, conv,
+// eltwise_bias, eltwise_out,
+// elementwise_add
+struct ConvBias : public PatternBase {
+  ConvBias(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_bias") {}
+  PDNode* operator()(PDNode* conv_input);
+  // declare operator node's name
+  PATTERN_DECL_NODE(conv);
+  PATTERN_DECL_NODE(eltwise);
+  // declare variable node's name
+  PATTERN_DECL_NODE(conv_weight);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(eltwise_bias);
+  PATTERN_DECL_NODE(eltwise_out);
+};
+// Convolution op
+// Forward pass for convolution.
+// conv_input, conv_bias and conv_filter are inputs.
+// conv_output is a result of the operator.
+// residual_data is data used by skip connection.
+// If residual connection fusion is on, the formula is:
+// conv_output = conv_op(conv_filter, conv_input, conv_bias)
+//             + conv_residual_data
+// If the fusion is off, conv_residual_data is not added.
+struct Conv : public PatternBase {
+  Conv(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "convolution") {}
+  PDNode* operator()();
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_input);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_residual_data);
+  PATTERN_DECL_NODE(conv_output);
+};
+// ElementwiseAdd used in residual connections.
+// y_var is used and convolution output.
+// The operator is removed, when residual
+// connection fusion is on.
+struct ElementwiseAdd : public PatternBase {
+  ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elementwise_add") {}
+  PDNode* operator()(PDNode* x_var);
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_x);
+  PATTERN_DECL_NODE(elementwise_add_y);
+  PATTERN_DECL_NODE(elementwise_add_out);
+};
 }  // namespace patterns
 // Link two ir::Nodes from each other.

--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
+      n->Op()->SetAttr("use_mkldnn", true);
+    }
+  }
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(mkldnn_placement_pass,
+              paddle::framework::ir::MKLDNNPlacementPass);
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class MKLDNNPlacementPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "X"))
+                  ->assert_is_op_input("sequence_conv")
+                  ->assert_var_not_persistable();
+  patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope);
+  fuse_pattern(x);
+  // Create New OpDesc
+  auto fuse_creator = [&](Node* seqconv, Node* input, Node* seqconv_weight,
+                          Node* eltadd_bias, Node* relu_out) {
+    OpDesc op_desc;
+    op_desc.SetType("fusion_seqconv_eltadd_relu");
+    op_desc.SetInput("X", {input->Name()});
+    op_desc.SetInput("Filter", {seqconv_weight->Name()});
+    op_desc.SetInput("Bias", {eltadd_bias->Name()});
+    op_desc.SetAttr("contextLength", seqconv->Op()->GetAttr("contextLength"));
+    op_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart"));
+    op_desc.SetAttr("contextStride", seqconv->Op()->GetAttr("contextStride"));
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+    const std::string ColMat = patterns::UniqueKey("SeqConvColMat");
+    op_desc.SetOutput("ColMat", {ColMat});
+    op_desc.SetOutput("Out", {relu_out->Name()});
+    scope->Var(ColMat)->GetMutable<LoDTensor>();
+    auto* op = graph->CreateOpNode(&op_desc);
+    IR_NODE_LINK_TO(input, op);
+    IR_NODE_LINK_TO(seqconv_weight, op);
+    IR_NODE_LINK_TO(eltadd_bias, op);
+    IR_NODE_LINK_TO(op, relu_out);
+    return op;
+  };
+  int fusion_count{0};
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle SeqConv EltAdd Relu fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_bias, eltadd_bias, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_out, eltadd_out, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, fuse_pattern);
+    fuse_creator(seqconv, subgraph.at(x), seqconv_weight, eltadd_bias,
+                 relu_out);
+    std::unordered_set<const Node*> marked_nodes(
+        {seqconv, seqconv_out, eltadd, eltadd_out, relu});
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+  gpd(graph, handler);
+  return fusion_count;
+}
+std::unique_ptr<ir::Graph> SeqConvEltAddReluFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope());
+  AddStatis(fusion_count);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
+              paddle::framework::ir::SeqConvEltAddReluFusePass);
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class SeqConvEltAddReluFusePass : public FusePassBase {
+ public:
+  virtual ~SeqConvEltAddReluFusePass() {}
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
  } else if (var_type == proto::VarType::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope>>();
+    var->GetMutable<std::vector<framework::Scope *>>();
  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -85,10 +85,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
      VLOG(3) << "input " << in << " is not LodTensor";
      return;
    }
-    PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR,
-                      "The %d-th output of Output(%s) must be LoDTensor.", j,
-                      out);
    out_var->SetLoDLevel(in_var->GetLoDLevel());
  }

--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -100,16 +100,6 @@ class OpDesc {
  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
-  void SetInputMap(const VariableNameMap &input) {
-    this->inputs_ = input;
-    this->need_update_ = true;
-  }
-  void SetOutputMap(const VariableNameMap &output) {
-    this->outputs_ = output;
-    this->need_update_ = true;
-  }
  const VariableNameMap &Inputs() const { return inputs_; }
  const VariableNameMap &Outputs() const { return outputs_; }

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
    platform::SetDeviceId(dev_id);
 #endif
  }
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::RecordEvent record_event(Type(), pool.Get(place));
+  // The profile has a process-wide mutex, results in serious performance issue
-  RunImpl(scope, place);
+  // in concurrency scenerio. Here use an `if` to fix this issue.
+  // Please not remove the `if`, ask @Superjomn if there are any concern.
+  if (platform::IsProfileEnabled()) {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(Type(), pool.Get(place));
+    RunImpl(scope, place);
+  } else {
+    RunImpl(scope, place);
+  }
  VLOG(3) << place << " " << DebugStringEx(&scope);
 }

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -299,6 +299,12 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 }
 ParallelExecutor::~ParallelExecutor() {
+  const auto dev_ctxs =
+      platform::DeviceContextPool::Instance().GetAllDeviceContexts();
+  for (auto &dev_ctx : dev_ctxs) {
+    dev_ctx->Wait();
+  }
  if (member_->own_local_scope_) {
    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
      Scope *local_scope = member_->local_scopes_[i];
@@ -307,6 +313,10 @@ ParallelExecutor::~ParallelExecutor() {
      }
    }
  }
+  // member_ must be destructed before gcs_ since the destructor of
+  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
+  member_.reset();
 }
 }  // namespace framework

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -75,7 +75,7 @@ class ParallelExecutor {
 private:
  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
-  ParallelExecutorPrivate *member_;
+  std::unique_ptr<ParallelExecutorPrivate> member_;
 #ifdef PADDLE_WITH_CUDA
  // ref_cnts_ is only initialized when ParallelExecutor constructs, and then

--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -126,7 +126,7 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
  std::vector<std::string> feed_target_names;
  for (auto *op : global_block.AllOps()) {
    if (op->Type() == kFeedOpType) {
-      int col = boost::get<int>(op->GetAttr("col"));
+      size_t col = boost::get<int>(op->GetAttr("col"));
      if (col >= feed_target_names.size()) {
        feed_target_names.resize(col + 1);
      }
@@ -143,7 +143,7 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
  std::vector<std::string> fetch_target_names;
  for (auto *op : global_block.AllOps()) {
    if (op->Type() == kFetchOpType) {
-      int col = boost::get<int>(op->GetAttr("col"));
+      size_t col = boost::get<int>(op->GetAttr("col"));
      if (col >= fetch_target_names.size()) {
        fetch_target_names.resize(col + 1);
      }

--- a/paddle/fluid/framework/reader_test.cc
+++ b/paddle/fluid/framework/reader_test.cc
@@ -39,7 +39,7 @@ TEST(READER, decorate_chain) {
  {
    auto endpoints = root->GetEndPoints();
    ASSERT_EQ(endpoints.size(), 2U);
-    ASSERT_NE(endpoints.count(end_point1.get()), 0);
+    ASSERT_NE(endpoints.count(end_point1.get()), 0UL);
    ASSERT_NE(endpoints.count(end_point2.get()), 0);
  }

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 Scope& Scope::NewScope() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  kids_.push_back(new Scope(this));
  return *kids_.back();
 }
 Variable* Scope::Var(const std::string& name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return VarInternal(name);
 }
 Variable* Scope::Var(std::string* name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
    *name = new_name;
@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
 }
 Variable* Scope::FindVar(const std::string& name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return FindVarInternal(name);
 }
+Variable* Scope::FindLocalVar(const std::string& name) const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return FindVarLocally(name);
+}
 const Scope* Scope::FindScope(const Variable* var) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  return FindScopeInternal(var);
 }
 void Scope::DropKids() {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  for (Scope* s : kids_) delete s;
  kids_.clear();
 }
 bool Scope::HasKid(const Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  return it != this->kids_.end();
 }
 std::vector<std::string> Scope::LocalVarNames() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  std::vector<std::string> known_vars;
  known_vars.reserve(this->vars_.size());
  for (auto& p : vars_) {
@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 void Scope::DeleteScope(Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  RenameInternal(origin_name, new_name);
 }
 std::string Scope::Rename(const std::string& origin_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  RenameInternal(origin_name, new_name);
  return new_name;

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -63,6 +63,11 @@ class Scope {
  /// Caller doesn't own the returned Variable.
  Variable* FindVar(const std::string& name) const;
+  /// Find a variable in the current scope.
+  /// Return nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
+  Variable* FindLocalVar(const std::string& name) const;
  const Scope* parent() const { return parent_; }
  /// Find the scope or an ancestor scope that contains the given variable.

--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -91,7 +91,7 @@ TEST(SelectedRows, SparseTable) {
  ASSERT_TRUE(table.HasKey(10));
  ASSERT_TRUE(table.HasKey(8));
  ASSERT_TRUE(table.HasKey(6));
-  ASSERT_EQ(table.rows().size(), 3);
+  ASSERT_EQ(table.rows().size(), 3UL);
  framework::Tensor ids;
  ids.Resize(framework::make_ddim({4}));

--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -37,6 +37,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
  auto size = src.numel() * SizeOfType(src.type());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
  }
@@ -72,6 +77,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
    if (platform::is_same_place(src_place, dst_place)) {
+      if (src_ptr == dst_ptr) {
+        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+                << dst_place;
+        return;
+      }
      memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                   stream);
    } else {
@@ -115,6 +125,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
  auto dst_ptr = dst->mutable_data(dst_place, src.type());
  auto size = src.numel() * SizeOfType(src.type());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data from " << src_place << " to "
+              << dst_place;
+      return;
+    }
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
  }
@@ -131,6 +146,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
  } else if (platform::is_gpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
+    if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
+      VLOG(3) << "Skip copy the same data from " << src_place << " to "
+              << dst_place;
+      return;
+    }
    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);

--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) {
    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
  }
+  TensorCopy(dst_tensor, *cpu_place, &dst_tensor);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
  EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
  Tensor slice_tensor = src_tensor.Slice(1, 2);
@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) {
      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
    }
+    // Copy the same tensor
+    TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+    gpu_ctx.Wait();
+    const int* dst_ptr_tmp = dst_tensor.data<int>();
+    EXPECT_NE(src_ptr, dst_ptr_tmp);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
+    }
    Tensor slice_tensor = src_tensor.Slice(1, 2);
    // CPU Slice Tensor to GPU Tensor

--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -59,6 +59,7 @@ class VarDesc {
 public:
  explicit VarDesc(const std::string &name) {
    desc_.set_name(name);
+    // TODO(paddle-dev): Why default to lodtensor.
    desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR);
  }

--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
@@ -38,8 +38,12 @@ class Variable {
  template <typename T>
  T* GetMutable() {
-    if (!IsType<T>()) {
+    if (!holder_) {
      holder_.reset(new PlaceholderImpl<T>(new T()));
+    } else {
+      PADDLE_ENFORCE(IsType<T>(),
+                     "Variable must be type %s, the holding type is %s",
+                     typeid(T).name(), holder_->Type().name());
    }
    return static_cast<T*>(holder_->Ptr());
  }

--- a/paddle/fluid/framework/variable_test.cc
+++ b/paddle/fluid/framework/variable_test.cc
@@ -33,9 +33,10 @@ TEST(Variable, GetMutable) {
  const Tensor& tt = v->Get<Tensor>();
  EXPECT_EQ(1234, tt.content_);
-  std::string* s = v->GetMutable<std::string>();
+  try {
-  *s = "hello";
+    v->GetMutable<std::string>();
+  } catch (std::exception& e) {
-  const std::string& ss = v->Get<std::string>();
+    return;
-  EXPECT_EQ("hello", ss);
+  }
+  EXPECT_TRUE(false);
 }
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -19,9 +19,19 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 add_subdirectory(api)
+set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
+set(SHARED_INFERENCE_SRCS
+    io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
+if (WITH_GPU AND TENSORRT_FOUND)
+  set(STATIC_INFERENCE_APIS ${STATIC_INFERENCE_APIS} paddle_inference_tensorrt_subgraph_engine)
+  set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/api_tensorrt_subgraph_engine.cc)
+endif()
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api 
+cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor)
-           analysis_predictor zero_copy_tensor)
 if(NOT APPLE)
  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
@@ -29,10 +39,7 @@ if(NOT APPLE)
 endif()
 # Create shared library
-cc_library(paddle_fluid_shared SHARED
+cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-    SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
    DEPS ${fluid_modules} paddle_fluid_api)
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
      auto trt_teller = [&](const Node* node) {
        std::unordered_set<std::string> teller_set(
            {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-             "depthwise_conv2d", "batch_norm", "concat", "tanh",
+             "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
             "elementwise_add", "dropout"});
        if (!node->IsFunction()) return false;
@@ -101,7 +101,13 @@ Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
 void Analyzer::Run(Argument* argument) {
  std::vector<std::string> passes;
-  for (auto& pass : all_ir_passes_) {
+#ifdef PADDLE_WITH_MKLDNN
+  if (use_mkldnn_) {
+    VLOG(3) << "Adding MKL-DNN placement pass";
+    passes.push_back("mkldnn_placement_pass");
+  }
+#endif
+  for (auto& pass : ir_passes_) {
    if (!disabled_ir_passes_.count(pass)) {
      passes.push_back(pass);
      passes.push_back("graph_viz_pass");  // add graphviz for debug.
@@ -117,11 +123,26 @@ void Analyzer::Run(Argument* argument) {
  }
 }
+Analyzer& Analyzer::IncludeAllIrPasses() {
+  ir_passes_ = all_ir_passes_;
+  return *this;
+}
 Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
  disabled_ir_passes_.insert(passes.begin(), passes.end());
  return *this;
 }
+Analyzer& Analyzer::IncludeIrPasses(const std::vector<std::string>& passes) {
+  ir_passes_ = passes;
+  return *this;
+}
+Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) {
+  use_mkldnn_ = use_mkldnn;
+  return *this;
+}
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -54,6 +54,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
  void Run(Argument* argument);
  Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
+  Analyzer& IncludeIrPasses(const std::vector<std::string>& passes);
+  Analyzer& IncludeAllIrPasses();
+  Analyzer& SetUseMkldnn(bool use_mkldnn);
  DISABLE_COPY_AND_ASSIGN(Analyzer);
@@ -64,21 +67,29 @@ class Analyzer : public OrderedRegistry<PassManager> {
  // larger fusion.
  const std::vector<std::string> all_ir_passes_{{
      // Manual update the passes here.
-      "infer_clean_graph_pass",       //
+      "infer_clean_graph_pass",         //
-      "attention_lstm_fuse_pass",     //
+      "attention_lstm_fuse_pass",       //
-      "embedding_fc_lstm_fuse_pass",  //
+      "seqconv_eltadd_relu_fuse_pass",  //
-      "fc_lstm_fuse_pass",            //
+      "embedding_fc_lstm_fuse_pass",    //
-      "mul_lstm_fuse_pass",           //
+      "fc_lstm_fuse_pass",              //
-      "fc_gru_fuse_pass",             //
+      "mul_lstm_fuse_pass",             //
-      "mul_gru_fuse_pass",            //
+      "fc_gru_fuse_pass",               //
-      "seq_concat_fc_fuse_pass",      //
+      "mul_gru_fuse_pass",              //
-      "fc_fuse_pass",                 //
+      "seq_concat_fc_fuse_pass",        //
+      "fc_fuse_pass",                   //
+      "conv_bn_fuse_pass",              //
+      "conv_eltwiseadd_bn_fuse_pass",   //
 #ifdef PADDLE_WITH_MKLDNN
-      "conv_relu_mkldnn_fuse_pass",  //
+      "conv_bias_mkldnn_fuse_pass",             //
+      "conv_relu_mkldnn_fuse_pass",             //
+      "conv_elementwise_add_mkldnn_fuse_pass",  //
 #endif
  }};
  std::unordered_set<std::string> disabled_ir_passes_;
+  // Ir passes to run
+  std::vector<std::string> ir_passes_;
+  bool use_mkldnn_;
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -51,9 +51,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
  config.model_dir = model_path;
  config.use_gpu = false;
  config.device = 0;
-  auto predictor =
+  auto predictor = ::paddle::CreatePaddlePredictor<NativeConfig>(config);
-      ::paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
-          config);
  // One single batch

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -25,9 +25,11 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 DECLARE_bool(profile);
+DECLARE_int32(paddle_num_threads);
 namespace paddle {
@@ -47,6 +49,9 @@ bool AnalysisPredictor::Init(
  }
 #endif
+  // no matter with or without MKLDNN
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
    LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
@@ -72,10 +77,6 @@ bool AnalysisPredictor::Init(
    inference_program_ = program;
  }
-  if (config_._use_mkldnn) {
-    executor_->EnableMKLDNN(*inference_program_);
-  }
  executor_->Prepare(scope_.get(), *inference_program_, 0,
                     config_.use_feed_fetch_ops);
@@ -220,10 +221,24 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  argument_.origin_program_desc.reset(
      new ProgramDesc(*inference_program_->Proto()));
-  PADDLE_ENFORCE(
-      config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude,
+  switch (config_.ir_mode) {
-      "Only kExclude is supported yet.");
+    case contrib::AnalysisConfig::IrPassMode::kExclude:
-  Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
+      Analyzer()
+          .IncludeAllIrPasses()
+          .SetUseMkldnn(config_._use_mkldnn)
+          .DisableIrPasses(config_.ir_passes)
+          .Run(&argument_);
+      break;
+    case contrib::AnalysisConfig::IrPassMode::kInclude:
+      Analyzer()
+          .SetUseMkldnn(config_._use_mkldnn)
+          .IncludeIrPasses(config_.ir_passes)
+          .Run(&argument_);
+      break;
+    default:
+      LOG(ERROR) << "Only kExclude and kInclude modes are supoorted yet.";
+  }
  CHECK(argument_.transformed_program_desc);
  VLOG(5) << "to prepare executor";
@@ -335,6 +350,19 @@ bool AnalysisPredictor::LoadProgramDesc() {
  }
  return true;
 }
+AnalysisPredictor::~AnalysisPredictor() {
+#if !defined(_WIN32)
+  if (FLAGS_profile) {
+    platform::DisableProfiler(platform::EventSortingKey::kTotal,
+                              "./profile.log");
+  }
+#endif
+  if (sub_scope_) {
+    scope_->DeleteScope(sub_scope_);
+  }
+}
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
  auto *x = new AnalysisPredictor(config_);
  x->Init(scope_, inference_program_);

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor {
  template <typename T>
  void GetFetchOne(const framework::LoDTensor &fetchs,
                   PaddleTensor *output_data);
+  ~AnalysisPredictor();
 private:
  contrib::AnalysisConfig config_;

--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -27,9 +27,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
  config.use_feed_fetch_ops = false;
-  auto predictor =
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
  auto w0 = predictor->GetInputTensor("firstw");
  auto w1 = predictor->GetInputTensor("secondw");

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -23,9 +23,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 DEFINE_bool(profile, false, "Turn on profiler for fluid");
+DECLARE_int32(paddle_num_threads);
 namespace paddle {
 namespace {
@@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init(
  }
 #endif
+  // no matter with or without MKLDNN
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else {

--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -205,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
      float* ref_data = refs[tid].data<float>();
      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
      for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
+        EXPECT_NEAR(ref_data[i], data[i], 2e-3);
      }
    });
  }

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax);
 USE_TRT_CONVERTER(batch_norm);
 USE_TRT_CONVERTER(concat);
 USE_TRT_CONVERTER(dropout);
+USE_TRT_CONVERTER(pad);
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -41,11 +41,8 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
  config1.device = 0;
  config1.max_batch_size = 10;
-  auto predictor0 =
+  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
-  auto predictor1 =
-      CreatePaddlePredictor<MixedRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
  for (int batch_id = 0; batch_id < 1; batch_id++) {
    //# 2. Prepare input.

--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -3,6 +3,7 @@ project(cpp_inference_demo CXX C)
 option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."       ON)
 option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
 option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
+option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
 macro(safe_set_static_flag)
    foreach(flag_var
@@ -60,6 +61,13 @@ endif(NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
+if (NOT WIN32) 
+  if (USE_TENSORRT AND WITH_GPU) 
+      include_directories("${TENSORRT_INCLUDE_DIR}")
+      link_directories("${TENSORRT_LIB_DIR}")
+  endif()
+endif(NOT WIN32)
 if (NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
 link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
@@ -69,7 +77,7 @@ endif(NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/paddle/fluid/inference")
+link_directories("${PADDLE_LIB}/paddle/lib")
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
@@ -89,10 +97,10 @@ endif()
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
  set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
+      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
  set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
+      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
 if (NOT WIN32)
@@ -112,6 +120,10 @@ endif(NOT WIN32)
 if(WITH_GPU)
  if(NOT WIN32)
+    if (USE_TENSORRT) 
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+    endif()
    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
  else()
    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )

--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -3,11 +3,15 @@ PADDLE_ROOT=$1
 TURN_ON_MKL=$2 # use MKL or Openblas
 TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
+TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, defalut to /usr/local/TensorRT/include
+TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
+inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir
 cd `dirname $0`
 current_dir=`pwd`
 if [ $2 == ON ]; then
  # You can export yourself if move the install path
-  MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
+  MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
 fi
 if [ $3 == ON ]; then
@@ -16,6 +20,11 @@ else
  use_gpu_list='false'
 fi
+USE_TENSORRT=OFF
+if [ [-d"$TENSORRT_INCLUDE_DIR"] -a [-d"$TENSORRT_LIB_DIR"] ]; then
+  USE_TENSORRT=ON
+fi
 PREFIX=inference-vis-demos%2F
 URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
@@ -47,7 +56,7 @@ cd build
 for WITH_STATIC_LIB in ON OFF; do
  # -----simple_on_word2vec-----
  rm -rf *
-  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  cmake .. -DPADDLE_LIB=${inference_install_dir} \
    -DWITH_MKL=$TURN_ON_MKL \
    -DDEMO_NAME=simple_on_word2vec \
    -DWITH_GPU=$TEST_GPU_CPU \
@@ -67,7 +76,7 @@ for WITH_STATIC_LIB in ON OFF; do
  fi
  # ---------vis_demo---------
  rm -rf *
-  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  cmake .. -DPADDLE_LIB=${inference_install_dir} \
    -DWITH_MKL=$TURN_ON_MKL \
    -DDEMO_NAME=vis_demo \
    -DWITH_GPU=$TEST_GPU_CPU \
@@ -86,5 +95,23 @@ for WITH_STATIC_LIB in ON OFF; do
      fi
    done
  done
+  # --------tensorrt mobilenet------
+  if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
+    rm -rf *
+    cmake .. -DPADDLE_LIB=${inference_install_dir} \
+      -DWITH_MKL=$TURN_ON_MKL \
+      -DDEMO_NAME=trt_mobilenet_demo \
+      -DWITH_GPU=$TEST_GPU_CPU \
+      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+      -DUSE_TENSORRT=$USE_TENSORRT \
+      -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
+      -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR
+    make -j 
+    ./trt_mobilenet_demo \
+      --modeldir=$DATA_DIR/mobilenet/model \
+      --data=$DATA_DIR/mobilenet/data.txt \
+      --refer=$DATA_DIR/mobilenet/result.txt 
+  fi
 done
 set +x
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include <memory>
 #include <thread>  //NOLINT
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle/include/paddle_inference_api.h"
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");
@@ -42,8 +42,7 @@ void Main(bool use_gpu) {
  config.use_gpu = use_gpu;
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
-  auto predictor =
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  for (int batch_id = 0; batch_id < 3; batch_id++) {
    //# 2. Prepare input.
@@ -85,8 +84,7 @@ void MainThreads(int num_threads, bool use_gpu) {
  config.use_gpu = use_gpu;
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
-  auto main_predictor =
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {

--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+/*
+ * This file contains demo of mobilenet for tensorrt.
+ */
+#include <gflags/gflags.h>
+#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
+#include "utils.h"  // NOLINT
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DEFINE_string(modeldir, "", "Directory of the inference model.");
+DEFINE_string(refer, "", "path to reference result for comparison.");
+DEFINE_string(
+    data, "",
+    "path of data; each line is a record, format is "
+    "'<space splitted floats as data>\t<space splitted ints as shape'");
+namespace paddle {
+namespace demo {
+/*
+ * Use the tensorrt fluid engine to inference the demo.
+ */
+void Main() {
+  std::unique_ptr<PaddlePredictor> predictor;
+  paddle::contrib::MixedRTConfig config;
+  config.param_file = FLAGS_modeldir + "/__params__";
+  config.prog_file = FLAGS_modeldir + "/__model__";
+  config.use_gpu = true;
+  config.device = 0;
+  config.max_batch_size = 1;
+  config.fraction_of_gpu_memory = 0.1;  // set by yourself
+  predictor = CreatePaddlePredictor<paddle::contrib::MixedRTConfig>(config);
+  VLOG(3) << "begin to process data";
+  // Just a single batch of data.
+  std::string line;
+  std::ifstream file(FLAGS_data);
+  std::getline(file, line);
+  auto record = ProcessALine(line);
+  file.close();
+  // Inference.
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;
+  VLOG(3) << "run executor";
+  std::vector<PaddleTensor> output;
+  predictor->Run({input}, &output, 1);
+  VLOG(3) << "output.size " << output.size();
+  auto& tensor = output.front();
+  VLOG(3) << "output: " << SummaryTensor(tensor);
+  // compare with reference result
+  CheckOutput(FLAGS_refer, tensor);
+}
+}  // namespace demo
+}  // namespace paddle
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main();
+  return 0;
+}
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -14,13 +14,20 @@
 #pragma once
 #include <algorithm>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle/include/paddle_inference_api.h"
 namespace paddle {
 namespace demo {
+struct Record {
+  std::vector<float> data;
+  std::vector<int32_t> shape;
+};
 static void split(const std::string& str, char sep,
                  std::vector<std::string>* pieces) {
  pieces->clear();
@@ -39,6 +46,58 @@ static void split(const std::string& str, char sep,
  }
 }
+Record ProcessALine(const std::string& line) {
+  VLOG(3) << "process a line";
+  std::vector<std::string> columns;
+  split(line, '\t', &columns);
+  CHECK_EQ(columns.size(), 2UL)
+      << "data format error, should be <data>\t<shape>";
+  Record record;
+  std::vector<std::string> data_strs;
+  split(columns[0], ' ', &data_strs);
+  for (auto& d : data_strs) {
+    record.data.push_back(std::stof(d));
+  }
+  std::vector<std::string> shape_strs;
+  split(columns[1], ' ', &shape_strs);
+  for (auto& s : shape_strs) {
+    record.shape.push_back(std::stoi(s));
+  }
+  VLOG(3) << "data size " << record.data.size();
+  VLOG(3) << "data shape size " << record.shape.size();
+  return record;
+}
+void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
+  std::string line;
+  std::ifstream file(referfile);
+  std::getline(file, line);
+  auto refer = ProcessALine(line);
+  file.close();
+  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
+  VLOG(3) << "predictor output numel " << numel;
+  VLOG(3) << "reference output numel " << refer.data.size();
+  CHECK_EQ(numel, refer.data.size());
+  switch (output.dtype) {
+    case PaddleDType::INT64: {
+      for (size_t i = 0; i < numel; ++i) {
+        CHECK_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
+      }
+      break;
+    }
+    case PaddleDType::FLOAT32:
+      for (size_t i = 0; i < numel; ++i) {
+        CHECK_LT(
+            fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
+            1e-5);
+      }
+      break;
+  }
+}
 /*
 * Get a summary of a PaddleTensor content.
 */

--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -18,11 +18,7 @@ limitations under the License. */
 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
-#include <fstream>
+#include "utils.h"  // NOLINT
-#include <iostream>
-// #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/inference/demo_ci/utils.h"
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -38,70 +34,13 @@ DEFINE_bool(use_gpu, false, "Whether use gpu.");
 namespace paddle {
 namespace demo {
-struct Record {
+using contrib::AnalysisConfig;
-  std::vector<float> data;
-  std::vector<int32_t> shape;
-};
-void split(const std::string& str, char sep, std::vector<std::string>* pieces);
-Record ProcessALine(const std::string& line) {
-  VLOG(3) << "process a line";
-  std::vector<std::string> columns;
-  split(line, '\t', &columns);
-  CHECK_EQ(columns.size(), 2UL)
-      << "data format error, should be <data>\t<shape>";
-  Record record;
-  std::vector<std::string> data_strs;
-  split(columns[0], ' ', &data_strs);
-  for (auto& d : data_strs) {
-    record.data.push_back(std::stof(d));
-  }
-  std::vector<std::string> shape_strs;
-  split(columns[1], ' ', &shape_strs);
-  for (auto& s : shape_strs) {
-    record.shape.push_back(std::stoi(s));
-  }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
-  return record;
-}
-void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
-  std::string line;
-  std::ifstream file(referfile);
-  std::getline(file, line);
-  auto refer = ProcessALine(line);
-  file.close();
-  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-  VLOG(3) << "predictor output numel " << numel;
-  VLOG(3) << "reference output numel " << refer.data.size();
-  CHECK_EQ(numel, refer.data.size());
-  switch (output.dtype) {
-    case PaddleDType::INT64: {
-      for (size_t i = 0; i < numel; ++i) {
-        CHECK_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
-      }
-      break;
-    }
-    case PaddleDType::FLOAT32:
-      for (size_t i = 0; i < numel; ++i) {
-        CHECK_LT(
-            fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
-            1e-5);
-      }
-      break;
-  }
-}
 /*
- * Use the native fluid engine to inference the demo.
+ * Use the native and analysis fluid engine to inference the demo.
 */
 void Main(bool use_gpu) {
-  NativeConfig config;
+  std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
+  AnalysisConfig config;
  config.param_file = FLAGS_modeldir + "/__params__";
  config.prog_file = FLAGS_modeldir + "/__model__";
  config.use_gpu = use_gpu;
@@ -111,8 +50,8 @@ void Main(bool use_gpu) {
  }
  VLOG(3) << "init predictor";
-  auto predictor =
+  predictor = CreatePaddlePredictor<NativeConfig>(config);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  VLOG(3) << "begin to process data";
  // Just a single batch of data.
@@ -130,8 +69,8 @@ void Main(bool use_gpu) {
  input.dtype = PaddleDType::FLOAT32;
  VLOG(3) << "run executor";
-  std::vector<PaddleTensor> output;
+  std::vector<PaddleTensor> output, analysis_output;
-  predictor->Run({input}, &output);
+  predictor->Run({input}, &output, 1);
  VLOG(3) << "output.size " << output.size();
  auto& tensor = output.front();
@@ -139,6 +78,10 @@ void Main(bool use_gpu) {
  // compare with reference result
  CheckOutput(FLAGS_refer, tensor);
+  // the analysis_output has some diff with native_output,
+  // TODO(luotao): add CheckOutput for analysis_output later.
+  analysis_predictor->Run({input}, &analysis_output, 1);
 }
 }  // namespace demo
@@ -146,9 +89,10 @@ void Main(bool use_gpu) {
 int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);
-  paddle::demo::Main(false /* use_gpu*/);
  if (FLAGS_use_gpu) {
    paddle::demo::Main(true /*use_gpu*/);
+  } else {
+    paddle::demo::Main(false /*use_gpu*/);
  }
  return 0;
 }
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -259,10 +259,17 @@ struct AnalysisConfig : public NativeConfig {
    kExclude   // Specify the disabled passes in `ir_passes`.
  };
+  void SetIncludeMode() {
+    ir_mode = IrPassMode::kInclude;
+    // this pass has to be run at the beginning of all fuse passes
+    ir_passes = {"infer_clean_graph_pass"};
+  }
  // Determine whether to perform graph optimization.
  bool enable_ir_optim = true;
  // Manually determine the IR passes to run.
  IrPassMode ir_mode{IrPassMode::kExclude};
+  // passes to be excluded/included
  std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"};
  // NOT stable yet.

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
 # Add TRT tests
 nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc
  DEPS tensorrt_engine operator scope framework_proto op_registry)
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -26,6 +26,8 @@ nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
 nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
 nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
+nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+/*
+ * PadOp.
+ */
+class PadOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid transpose op to tensorrt tranpose layer";
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    const std::vector<int> paddings =
+        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+    const float pad_value = boost::get<float>(op_desc.GetAttr("pad_value"));
+    nvinfer1::Dims input_shape = input->getDimensions();
+    int nbDims = input_shape.nbDims;
+    int pad_size = static_cast<int>(paddings.size());
+    PADDLE_ENFORCE_GE(nbDims, 2);
+    PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size);
+    PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero.");
+    nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]);
+    nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]);
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Padding,
+                                       *const_cast<nvinfer1::ITensor*>(input),
+                                       pre_pad, post_pad);
+    PADDLE_ENFORCE(layer != nullptr);
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    layer->setName(("scale (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+REGISTER_TRT_OP_CONVERTER(pad, PadOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+TEST(PadConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("pad-X", nvinfer1::Dims3(3, 2, 2));
+  validator.DeclOutputVar("pad-Out", nvinfer1::Dims3(3, 3, 5));
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("pad");
+  desc.SetInput("X", {"pad-X"});
+  desc.SetOutput("Out", {"pad-Out"});
+  std::vector<int> paddings = {0, 0, 0, 0, 0, 1, 1, 2};
+  float pad_value = 0.0;
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("pad_value", pad_value);
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+  validator.Execute(2);
+}
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(pad);
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -52,9 +52,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 // Easy for profiling independently.
-TEST(Analyzer_resnet50, profile) {
+void profile(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  cfg._use_mkldnn = use_mkldnn;
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -69,6 +70,11 @@ TEST(Analyzer_resnet50, profile) {
  }
 }
+TEST(Analyzer_resnet50, profile) { profile(); }
+#ifndef PADDLE_WITH_MKLDNN
+TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); }
+#endif
 // Check the fuse status
 TEST(Analyzer_resnet50, fuse_statis) {
  AnalysisConfig cfg;
@@ -82,15 +88,21 @@ TEST(Analyzer_resnet50, fuse_statis) {
 }
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_resnet50, compare) {
+void compare(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  cfg._use_mkldnn = use_mkldnn;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
 }
+TEST(Analyzer_resnet50, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -308,18 +308,13 @@ TEST(Analyzer_rnn1, ZeroCopy) {
  PaddlePlace place;
  int output_size{0};
-  auto predictor =
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
  config.use_feed_fetch_ops = true;
-  auto native_predictor =
+  auto native_predictor = CreatePaddlePredictor<NativeConfig>(config);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  config.use_feed_fetch_ops = true;  // the analysis predictor needs feed/fetch.
-  auto analysis_predictor =
+  auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
 #define NEW_TENSOR(name__) \
  auto name__##_tensor = predictor->GetInputTensor(#name__);

--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -18,12 +18,12 @@ namespace paddle {
 namespace inference {
 using namespace framework;  // NOLINT
+static std::vector<float> result_data;
 struct DataRecord {
  std::vector<std::vector<std::vector<float>>> link_step_data_all;
  std::vector<size_t> lod;
  std::vector<std::vector<float>> rnn_link_data;
-  std::vector<float> result_data;
  size_t num_samples;  // total number of samples
  size_t batch_iter{0};
  size_t batch_size{1};
@@ -57,6 +57,7 @@ struct DataRecord {
    std::ifstream file(path);
    std::string line;
    int num_lines = 0;
+    result_data.clear();
    while (std::getline(file, line)) {
      num_lines++;
      std::vector<std::string> data;
@@ -135,13 +136,12 @@ TEST(Analyzer_rnn2, profile) {
  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    // the first inference result
-    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
    PADDLE_ENFORCE_GT(outputs.size(), 0);
    size_t size = GetSize(outputs[0]);
    PADDLE_ENFORCE_GT(size, 0);
    float *result = static_cast<float *>(outputs[0].data.data());
    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(result[i], data.result_data[i], 1e-3);
+      EXPECT_NEAR(result[i], result_data[i], 1e-3);
    }
  }
 }

--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -183,7 +183,13 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
  SetConfig(&cfg);
  int num_ops;
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  GetFuseStatis(predictor.get(), &num_ops);
+  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  ASSERT_TRUE(fuse_statis.count("seqconv_eltadd_relu_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 2);
+  EXPECT_EQ(fuse_statis.at("seqconv_eltadd_relu_fuse"), 6);
+  EXPECT_EQ(num_ops, 32);
 }
 // Compare result of NativeConfig and AnalysisConfig

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) {
  cfg->specify_input_name = true;
  // TODO(TJ): fix fusion gru
  cfg->ir_passes.push_back("fc_gru_fuse_pass");
-#ifdef PADDLE_WITH_MKLDNN
-  cfg->_use_mkldnn = true;
-#endif
 }
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -84,9 +81,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 // Easy for profiling independently.
 //  ocr, mobilenet and se_resnext50
-TEST(Analyzer_vis, profile) {
+void profile(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  cfg._use_mkldnn = use_mkldnn;
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -108,6 +106,12 @@ TEST(Analyzer_vis, profile) {
  }
 }
+TEST(Analyzer_vis, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
+#endif
 // Check the fuse status
 TEST(Analyzer_vis, fuse_statis) {
  AnalysisConfig cfg;
@@ -118,15 +122,21 @@ TEST(Analyzer_vis, fuse_statis) {
 }
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_vis, compare) {
+void compare(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  cfg._use_mkldnn = use_mkldnn;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
 }
+TEST(Analyzer_vis, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -77,11 +77,9 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
    const AnalysisConfig &config, bool use_analysis = true) {
  if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig,
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
-                                 PaddleEngineKind::kAnalysis>(config);
  } else {
-    return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+    return CreatePaddlePredictor<NativeConfig>(config);
-        config);
  }
 }
@@ -165,7 +163,8 @@ void TestPrediction(const AnalysisConfig &config,
                    const std::vector<std::vector<PaddleTensor>> &inputs,
                    std::vector<PaddleTensor> *outputs, int num_threads,
                    bool use_analysis = FLAGS_use_analysis) {
-  LOG(INFO) << "use_analysis: " << use_analysis;
+  LOG(INFO) << "use_analysis: " << use_analysis
+            << ", use_mkldnn: " << config._use_mkldnn;
  if (num_threads == 1) {
    TestOneThreadPrediction(config, inputs, outputs, use_analysis);
  } else {
@@ -177,6 +176,7 @@ void TestPrediction(const AnalysisConfig &config,
 void CompareNativeAndAnalysis(
    const AnalysisConfig &config,
    const std::vector<std::vector<PaddleTensor>> &inputs) {
+  LOG(INFO) << "use_mkldnn: " << config._use_mkldnn;
  std::vector<PaddleTensor> native_outputs, analysis_outputs;
  TestOneThreadPrediction(config, inputs, &native_outputs, false);
  TestOneThreadPrediction(config, inputs, &analysis_outputs, true);

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -51,11 +51,8 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
  config1.model_dir = model_dirname;
  config1.max_batch_size = batch_size;
-  auto predictor0 =
+  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
-  auto predictor1 =
-      CreatePaddlePredictor<MixedRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
  // Prepare inputs
  int height = 224;
  int width = 224;

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -86,7 +86,7 @@ function(op_library TARGET)
    # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
-     "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
          return()
        endif()
@@ -230,7 +230,7 @@ if(WITH_DISTRIBUTE)
        op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
        set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    endforeach()
    #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
    #        listen_and_serv_op sum_op executor SERIAL)
@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 else()
    set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
 endif()
+op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
 op_library(print_op DEPS lod_tensor)
@@ -299,11 +300,12 @@ op_library(flatten_op DEPS reshape_op)
 op_library(sequence_pad_op DEPS sequence_padding)
 op_library(unstack_op DEPS stack_op)
 op_library(fake_quantize_op DEPS memory)
-op_library(fusion_lstm_op DEPS cpu_lstm_compute)
+op_library(fusion_lstm_op DEPS jit_kernel)
 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)
    op_library(layer_norm_op DEPS cub)
    op_library(reduce_mean_op DEPS cub)
+    op_library(affine_channel_op DEPS cub)
 else()
    op_library(conv_op DEPS vol2col im2col)
 endif()

--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 class AdadeltaOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel {
                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of AdadeltaOp should not be null.");
@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
    ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
  }
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto input_data_type =

--- a/paddle/fluid/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class AdadeltaOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
    auto avg_squared_grad_out_tensor =
        ctx.Output<framework::Tensor>("AvgSquaredGradOut");

--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -21,25 +22,31 @@ namespace operators {
 template <typename DeviceContext, typename T>
 struct SparseAdagradFunctor {
-  void operator()(const DeviceContext& context,
+  void operator()(const DeviceContext &context,
-                  const framework::SelectedRows& grad,
+                  const framework::SelectedRows &grad,
-                  const framework::Tensor& learning_rate, T epsilon,
+                  const framework::Tensor &learning_rate, T epsilon,
-                  framework::Tensor* moment, framework::Tensor* param);
+                  framework::Tensor *moment, framework::Tensor *param);
 };
 template <typename DeviceContext, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto* param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    const auto *param_var = ctx.InputVar("Param");
-    auto* moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    auto *param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto *moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
    param_out_tensor->mutable_data<T>(ctx.GetPlace());
    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-    auto* grad_var = ctx.InputVar("Grad");
+    auto *grad_var = ctx.InputVar("Grad");
    if (grad_var->IsType<framework::LoDTensor>()) {
      auto param = framework::EigenVector<T>::Flatten(
          *ctx.Input<framework::Tensor>("Param"));
@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
          *ctx.Input<framework::Tensor>("Grad"));
      auto moment = framework::EigenVector<T>::Flatten(
          *ctx.Input<framework::Tensor>("Moment"));
-      auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+      auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
      auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
      auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-      auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+      auto *place = ctx.template device_context<DeviceContext>().eigen_device();
      moment_out.device(*place) = moment + grad * grad;
      Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
      if (platform::is_cpu_place(ctx.GetPlace())) {
-        auto* lr = learning_rate->data<T>();
+        auto *lr = learning_rate->data<T>();
        param_out.device(*place) =
            param - lr[0] * grad / (moment_out.sqrt() + epsilon);
      } else {
@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> {
            lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
      }
    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      auto* param_tensor = ctx.Input<framework::Tensor>("Param");
+      auto *param_tensor = ctx.Input<framework::Tensor>("Param");
      PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
-      auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
+      auto *moment_tensor = ctx.Input<framework::Tensor>("Moment");
      PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
      SparseAdagradFunctor<DeviceContext, T> functor;

--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/algorithm.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/for_range.h"
@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
        row_numel_(row_numel),
        row_count_(row_count) {}
-  inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
-    int64_t beg = 0, end = row_count_ - 1;
-    while (beg <= end) {
-      auto mid = ((beg + end) >> 1);
-      if (rows_[mid] == row)
-        return mid;
-      else if (rows_[mid] < row)
-        beg = mid + 1;
-      else
-        end = mid - 1;
-    }
-    return -1;
-  }
  inline HOSTDEVICE void operator()(size_t i) const {
-    int64_t row = i / row_numel_;
+    auto row_idx =
-    auto row_idx = BinarySearchInRows(row);
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
    // The following code is the same as dense
@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
    using paddle::framework::LoDTensor;
    using paddle::operators::detail::Ref;

--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel {
                   "Input(LearningRate) of AdamaxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
                   "Input(Beta1Pow) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of AdamaxOp should not be null.");

--- a/paddle/fluid/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class AdamaxOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
    auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");

--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+class AffineChannelOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) Feature map input can be a 4D tensor with order NCHW "
+             "or NHWC. It also can be a 2D tensor and C is the second "
+             "dimension.");
+    AddInput("Scale",
+             "(Tensor) 1D input of shape (C), the c-th element "
+             "is the scale factor of the affine transformation "
+             "for the c-th channel of the input.");
+    AddInput("Bias",
+             "(Tensor) 1D input of shape (C), the c-th element "
+             "is the bias of the affine transformation for the "
+             "c-th channel of the input.");
+    AddAttr<std::string>(
+        "data_layout",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
+    AddOutput("Out", "(Tensor) A tensor of the same shape and order with X.");
+    AddComment(R"DOC(
+Applies a separate affine transformation to each channel of the input. Useful
+for replacing spatial batch norm with its equivalent fixed transformation.
+The input also can be 2D tensor and applies a affine transformation in second
+dimension.
+$$Out = Scale*X + Bias$$
+)DOC");
+  }
+};
+class AffineChannelOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of AffineChannelOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", "Out");
+  }
+};
+class AffineChannelOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                     "Input(Scale) should not be null.");
+      ctx->SetOutputDim(framework::GradVarName("X"),
+                        ctx->GetInputDim(framework::GradVarName("Out")));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      // Scale@GRAD and Bias@GRAD must exist at the same time.
+      PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                     "Output(Scale@GRAD) should not be null.");
+      PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Scale"));
+    }
+  }
+};
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename DeviceContext, typename T>
+class AffineChannelKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+    auto* y = ctx.Output<framework::Tensor>("Out");
+    y->mutable_data<T>(ctx.GetPlace());
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+    auto dims = x->dims();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = x->numel() / N / C;
+    auto* scale_d = scale->data<T>();
+    auto* bias_d = bias->data<T>();
+    ConstEigenVectorArrayMap<T> a_e(scale_d, C);
+    ConstEigenVectorArrayMap<T> b_e(bias_d, C);
+    auto* x_d = x->data<T>();
+    auto* y_d = y->data<T>();
+    if (layout == framework::DataLayout::kNCHW) {
+      int stride = C * HxW;
+      for (int i = 0; i < N; i++) {
+        ConstEigenArrayMap<T> x_e(x_d, HxW, C);
+        EigenArrayMap<T> y_e(y_d, HxW, C);
+        y_e = (x_e.rowwise() * a_e.transpose()).rowwise() + b_e.transpose();
+        x_d += stride;
+        y_d += stride;
+      }
+    } else {
+      int num = N * HxW;
+      ConstEigenArrayMap<T> x_e(x_d, C, num);
+      EigenArrayMap<T> y_e(y_d, C, num);
+      y_e = (x_e.colwise() * a_e).colwise() + b_e;
+    }
+  }
+};
+template <typename DeviceContext, typename T>
+class AffineChannelGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dscale =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Scale"));
+    auto* dbias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+    auto dims = x->dims();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = x->numel() / N / C;
+    auto* x_d = x->data<T>();
+    auto* dy_d = dy->data<T>();
+    auto* scale_d = scale->data<T>();
+    ConstEigenVectorArrayMap<T> scale_e(scale_d, C);
+    T* dx_d = dx ? dx->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* dscale_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* dbias_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    EigenVectorArrayMap<T> dscale_e(dscale_d, C);
+    EigenVectorArrayMap<T> dbias_e(dbias_d, C);
+    if (layout == framework::DataLayout::kNCHW) {
+      // compute dx
+      int stride = C * HxW;
+      if (dx) {
+        for (int i = 0; i < N; i++) {
+          ConstEigenArrayMap<T> dy_e(dy_d, HxW, C);
+          EigenArrayMap<T> dx_e(dx_d, HxW, C);
+          dx_e = dy_e.rowwise() * scale_e.transpose();
+          dy_d += stride;
+          dx_d += stride;
+        }
+      }
+      // compute dscale and dbias
+      if (dscale && dbias) {
+        dy_d = dy->data<T>();
+        for (int i = 0; i < N; i++) {
+          ConstEigenArrayMap<T> x_e(x_d, HxW, C);
+          ConstEigenArrayMap<T> dy_e(dy_d, HxW, C);
+          if (i == 0) {
+            dscale_e = (x_e * dy_e).colwise().sum();
+          } else {
+            dscale_e += (x_e * dy_e).colwise().sum();
+          }
+          if (i == 0) {
+            dbias_e = dy_e.colwise().sum();
+          } else {
+            dbias_e += dy_e.colwise().sum();
+          }
+          x_d += stride;
+          dy_d += stride;
+        }
+      }
+    } else {
+      int num = N * HxW;
+      ConstEigenArrayMap<T> dy_e(dy_d, C, num);
+      // compute dx
+      if (dx) {
+        EigenArrayMap<T> dx_e(dx_d, C, num);
+        dx_e = dy_e.colwise() * scale_e;
+      }
+      // compute dscale and dbias
+      if (dscale && dbias) {
+        ConstEigenArrayMap<T> x_e(x_d, C, num);
+        dscale_e = (x_e * dy_e).rowwise().sum();
+        dbias_e = dy_e.rowwise().sum();
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
+                  ops::AffineChannelOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);
+REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,
+                       ops::AffineChannelKernel<CPU, double>);
+REGISTER_OP_CPU_KERNEL(affine_channel_grad,
+                       ops::AffineChannelGradKernel<CPU, float>,
+                       ops::AffineChannelGradKernel<CPU, double>);
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "cub/cub.cuh"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+namespace paddle {
+namespace operators {
+template <typename T, framework::DataLayout layout, bool HasBias>
+__global__ void KeAffineChannelCUDA(const T* x, const T* scale, const T* bias,
+                                    const int C, const int HxW, const int num,
+                                    T* y) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
+    if (HasBias) {
+      y[i] = scale[c] * x[i] + bias[c];
+    } else {
+      y[i] = scale[c] * x[i];
+    }
+  }
+}
+template <typename DeviceContext, typename T>
+class AffineChannelCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+    auto* y = ctx.Output<framework::Tensor>("Out");
+    y->mutable_data<T>(ctx.GetPlace());
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto dims = x->dims();
+    const int num = x->numel();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = num / N / C;
+    const T* x_d = x->data<T>();
+    const T* scale_d = scale->data<T>();
+    const T* bias_d = bias->data<T>();
+    T* y_d = y->data<T>();
+    int block = 1024;
+    int grid = (num + block - 1) / block;
+    if (layout == framework::DataLayout::kNCHW) {
+      KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                          true><<<grid, block, 0, dev_ctx.stream()>>>(
+          x_d, scale_d, bias_d, C, HxW, num, y_d);
+    } else {
+      KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
+                          true><<<grid, block, 0, dev_ctx.stream()>>>(
+          x_d, scale_d, bias_d, C, HxW, num, y_d);
+    }
+  }
+};
+template <typename T, int BlockDim, framework::DataLayout layout>
+__global__ void AffineChannelScaleBiasGradientCUDAKernel(
+    const T* dy, const T* x, const int N, const int C, const int HxW, T* dscale,
+    T* dbias) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ds_storage;
+  __shared__ typename BlockReduce::TempStorage db_storage;
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    T ds_sum = 0;
+    T db_sum = 0;
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == framework::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      ds_sum += dy[index] * x[index];
+      db_sum += dy[index];
+    }
+    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
+    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[i] = ds_sum;
+      dbias[i] = db_sum;
+    }
+    __syncthreads();
+  }
+}
+template <typename DeviceContext, typename T>
+class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dscale =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Scale"));
+    auto* dbias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto dims = x->dims();
+    const int num = x->numel();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = num / N / C;
+    const T* x_d = x->data<T>();
+    const T* dy_d = dy->data<T>();
+    const T* s_d = scale->data<T>();
+    T* dx_d = dx ? dx->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* ds_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* db_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    const int block = 1024;
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int max_blocks = std::max(max_threads / block, 1);
+    int grid1 = (num + block - 1) / block;
+    int grid2 = std::min(C, max_blocks);
+    if (layout == framework::DataLayout::kNCHW) {
+      if (dx) {
+        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
+            dy_d, s_d, nullptr, C, HxW, num, dx_d);
+      }
+      if (dscale && dbias) {
+        AffineChannelScaleBiasGradientCUDAKernel<
+            T, block, framework::DataLayout::kNCHW><<<grid2, block, 0,
+                                                      dev_ctx.stream()>>>(
+            dy_d, x_d, N, C, HxW, ds_d, db_d);
+      }
+    } else {
+      if (dx) {
+        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
+            dy_d, s_d, nullptr, C, HxW, num, dx_d);
+      }
+      if (dscale && dbias) {
+        AffineChannelScaleBiasGradientCUDAKernel<
+            T, block, framework::DataLayout::kNHWC><<<grid2, block, 0,
+                                                      dev_ctx.stream()>>>(
+            dy_d, x_d, N, C, HxW, ds_d, db_d);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(affine_channel,
+                        ops::AffineChannelCUDAKernel<CUDA, float>,
+                        ops::AffineChannelCUDAKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(affine_channel_grad,
+                        ops::AffineChannelGradCUDAKernel<CUDA, float>,
+                        ops::AffineChannelGradCUDAKernel<CUDA, double>);
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
@@ -16,12 +16,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/transform.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
+using SelectedRows = framework::SelectedRows;
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
@@ -31,9 +34,40 @@ class ClipByNormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto max_norm = context.Attr<T>("max_norm");
-    auto* input = context.Input<Tensor>("X");
+    auto in_var = context.InputVar("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
+    Tensor* output = nullptr;
+    const Tensor* input = nullptr;
+    if (in_var->IsType<framework::LoDTensor>()) {
+      input = context.Input<Tensor>("X");
+      output = context.Output<Tensor>("Out");
+      output->mutable_data<T>(context.GetPlace());
+    } else if (in_var->IsType<SelectedRows>()) {
+      auto* x = context.Input<SelectedRows>("X");
+      // merge ids in selected rows first
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      SelectedRows* merged_input =
+          const_cast<framework::Scope&>(context.scope())
+              .Var()
+              ->GetMutable<SelectedRows>();
+      merge_func(context.template device_context<DeviceContext>(), *x,
+                 merged_input);
+      input = &(merged_input->value());
+      SelectedRows* output_selected_rows = context.Output<SelectedRows>("Out");
+      output_selected_rows->set_rows(merged_input->rows());
+      output_selected_rows->set_height(merged_input->height());
+      output = output_selected_rows->mutable_value();
+      output->Resize(merged_input->value().dims());
+      output->mutable_data<T>(context.GetPlace());
+    } else {
+      PADDLE_THROW("Unexpected branch, input variable type is %s",
+                   in_var->Type().name());
+    }
+    PADDLE_ENFORCE_NOT_NULL(input);
    auto x = EigenVector<T>::Flatten(*input);
    auto out = EigenVector<T>::Flatten(*output);

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
-    bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise");
+    bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
    int groups = ctx.Attr<int>("groups");
-    // TODO: add support for dilation  // NOLINT
+    // TODO(tpatejko): add support for dilation
    PADDLE_ENFORCE(
        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
        "dilation in convolution is not implemented yet");
@@ -369,11 +369,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
          bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
                                     strides, paddings, mkldnn_engine,
-                                     fuse_relu, fuse_eltwise);
+                                     fuse_relu, fuse_residual_conn);
    } else {
      conv_pd =
          ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                               mkldnn_engine, fuse_relu, fuse_eltwise);
+                               mkldnn_engine, fuse_relu, fuse_residual_conn);
    }
    // Save conv_pd/src_memory/weights_memory for backward pass
    dev_ctx.SetBlob(key_conv_pd, conv_pd);
@@ -386,9 +386,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto user_weights_memory_p = handler.AcquireWeightsMemory(
        user_weights_md, to_void_cast<T>(filter_data));
-    T* output_data = output->mutable_data<T>(
+    T* output_data = nullptr;
-        ctx.GetPlace(), paddle::memory::Allocator::kDefault,
-        handler.GetDstMemorySize());
+    if (fuse_residual_conn) {
+      auto residual_param = ctx.Input<Tensor>("ResidualData");
+      auto residual_param_data = residual_param->data<T>();
+      PADDLE_ENFORCE(
+          residual_param_data != nullptr,
+          "Provide data if you want MKLDNN conv+elementwise_add fusion");
+      PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
+                        "Output and elementwise parameter need to have the "
+                        "same dimension sizes");
+      output->ShareDataWith(*residual_param);
+      output_data = output->mutable_data<T>(ctx.GetPlace());
+    } else {
+      output_data = output->mutable_data<T>(ctx.GetPlace(),
+                                            paddle::memory::Allocator::kDefault,
+                                            handler.GetDstMemorySize());
+    }
    // create reorder primitive if the input format is not the preferred one
    auto src_memory_p =
        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
@@ -425,14 +442,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 private:
  mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
-                                       bool fuse_eltwise) const {
+                                       bool fuse_residual_conn) const {
    mkldnn::primitive_attr conv_attr;
    mkldnn::post_ops post_operations;
    // Fusion with Elementwise layer relies on adding a sum post-operation with
-    // the scale parameter. It is assumed that when fuse_eltwise is true, the
+    // the scale parameter. It is assumed that when fuse_residual_connection is
-    // Output tensor contains the data coming from residual connection. The
+    // true, the output tensor contains the data coming from residual
-    // result of this post_op is: Output = scale * Output + Conv_Out.
+    // connection. The result of this post_op is:
-    if (fuse_eltwise) {
+    // Output = scale * Output + Conv_Out.
+    if (fuse_residual_conn) {
      post_operations.append_sum(1.0f);
    }
    // Fusion with ReLU layer is executed through the PostOps feature. Create a
@@ -453,7 +471,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                       const memory::desc& dst, const std::vector<int>& strides,
                       const std::vector<int>& paddings,
                       const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_eltwise) const {
+                       const bool fuse_residual_conn) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};
@@ -462,7 +480,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        dst, stride_dims, padding_dims, padding_dims,
        mkldnn::padding_kind::zero);
-    mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
+    mkldnn::primitive_attr conv_attr =
+        CreatePostOps(fuse_relu, fuse_residual_conn);
    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
        conv_desc, conv_attr, engine);
@@ -477,7 +496,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                       const std::vector<int>& strides,
                       const std::vector<int>& paddings,
                       const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_eltwise) const {
+                       const bool fuse_residual_conn) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};
@@ -486,7 +505,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        bias, dst, stride_dims, padding_dims, padding_dims,
        mkldnn::padding_kind::zero);
-    mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
+    mkldnn::primitive_attr conv_attr =
+        CreatePostOps(fuse_relu, fuse_residual_conn);
    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
        conv_desc, conv_attr, engine);

--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -132,6 +132,11 @@ void Conv2DOpMaker::Make() {
            "(Tensor) The output tensor of convolution operator. "
            "The format of output tensor is also NCHW.")
      .Reuse("Input");
+  AddInput("ResidualData",
+           "(Tensor) Tensor with residual data "
+           "to which convolution output will be added."
+           "Used with fuse_residual_connection fusion.")
+      .AsDispensable();
  AddAttr<std::vector<int>>("strides",
                            "(vector<int> default:{1, 1}), the "
                            "strides(h_stride, w_stride) of "
@@ -164,10 +169,10 @@ void Conv2DOpMaker::Make() {
      .SetDefault(false);
  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
-  AddAttr<bool>("fuse_eltwise",
+  AddAttr<bool>("fuse_residual_connection",
                "(bool, default false) Only used in mkldnn kernel. Used "
-                "whenever convolution output is connected via skip connection "
+                "whenever convolution output is as an input to residual "
-                "to a previous layer.")
+                "connection.")
      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",

--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(
        ctx->HasInput("LearningRate"),
        "Input(LearningRate) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of DecayedAdagradOp should not be null.");

--- a/paddle/fluid/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
 class DecayedAdagradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Param").front(), param_var->Type().name());
+    const auto* grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
+                   "The Var(%s)'s type should be LoDTensor, "
+                   "but the received is %s",
+                   ctx.Inputs("Grad").front(), grad_var->Type().name());
    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");

--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -20,7 +20,7 @@ detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
 detection_library(iou_similarity_op SRCS iou_similarity_op.cc
 iou_similarity_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
-detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
+detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc
 anchor_generator_op.cu)

--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -453,33 +453,45 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("Scores", "The scores of anchors should be foreground.");
+    AddInput("Scores",
-    AddInput("BboxDeltas", "bbox_deltas.");
+             "(Tensor) The scores from conv is in shape (N, A, H, W), "
-    AddInput("ImInfo", "Information for image reshape.");
+             "N is batch size, A is number of anchors, "
-    AddInput("Anchors", "All anchors.");
+             "H and W are height and width of the feature map");
-    AddInput("Variances", " variances");
+    AddInput("BboxDeltas",
+             "(Tensor) Bounding box deltas from conv is in "
-    AddOutput("RpnRois", "Anchors.");
+             "shape (N, 4*A, H, W).");
-    AddOutput("RpnRoiProbs", "Anchors.");
+    AddInput("ImInfo",
-    AddAttr<int>("pre_nms_topN", "pre_nms_topN");
+             "(Tensor) Information for image reshape is in shape (N, 3), "
-    AddAttr<int>("post_nms_topN", "post_nms_topN");
+             "in format (height, width, scale)");
-    AddAttr<float>("nms_thresh", "nms_thres");
+    AddInput("Anchors",
-    AddAttr<float>("min_size", "min size");
+             "(Tensor) Bounding box anchors from anchor_generator_op "
+             "is in shape (A, H, W, 4).");
+    AddInput("Variances",
+             "(Tensor) Bounding box variances with same shape as `Anchors`.");
+    AddOutput("RpnRois",
+              "(LoDTensor), Output proposals with shape (rois_num, 4).");
+    AddOutput("RpnRoiProbs",
+              "(LoDTensor) Scores of proposals with shape (rois_num, 1).");
+    AddAttr<int>("pre_nms_topN",
+                 "Number of top scoring RPN proposals to keep before "
+                 "applying NMS.");
+    AddAttr<int>("post_nms_topN",
+                 "Number of top scoring RPN proposals to keep after "
+                 "applying NMS");
+    AddAttr<float>("nms_thresh", "NMS threshold used on RPN proposals.");
+    AddAttr<float>("min_size",
+                   "Proposal height and width both need to be greater "
+                   "than this min_size.");
    AddAttr<float>("eta", "The parameter for adaptive NMS.");
    AddComment(R"DOC(
-Generate Proposals OP
+This operator Generate bounding box proposals for Faster RCNN.
+The propoasls are generated for a list of images based on image
+score 'Scores', bounding box regression result 'BboxDeltas' as
+well as predefined bounding box shapes 'anchors'. Greedy
+non-maximum suppression is applied to generate the final bounding
+boxes.
-This operator proposes rois according to each box with their probability to be a foreground object and 
-the box can be calculated by anchors. Bbox_details and scores are the output of RPN. Final proposals
-could be used to train detection net.
-Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
-of anchors, H and W are height and width of the feature map.
-BboxDeltas is the differece between predicted box location and anchor location. In format of (N, 4*A, H, W)
-For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and 
- calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area. 
-Finally, apply nms to get final proposals as output.
 )DOC");
  }
 };

--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -417,7 +417,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
    T *rpn_rois_data = rpn_rois->data<T>();
    T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
-    auto &place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
+    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
    int64_t num_proposals = 0;
    std::vector<size_t> offset(1, 0);

--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
--- a/paddle/fluid/operators/detection/gpc.h
+++ b/paddle/fluid/operators/detection/gpc.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/***************************************************************************
+ *
+ * Copyright (c) 2015 Baidu.com, Inc. All Rights Reserved
+ *
+ **************************************************************************/
+/**
+ * @file include/gpc.h
+ * @author huhan02(com@baidu.com)
+ * @date 2015/12/18 13:52:10
+ * @brief
+ *
+ * @modified by sunyipeng
+ * @email sunyipeng@baidu.com
+ * @date 2018/6/12
+ **/
+#ifndef PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_  // GPC_H_
+#define PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_  // GPC_H_
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+namespace gpc {
+typedef enum {  // Set operation type
+  GPC_DIFF,     // Difference
+  GPC_INT,      // Intersection
+  GPC_XOR,      // Exclusive or
+  GPC_UNION     // Union
+} gpc_op;
+typedef struct {  // Polygon vertex structure
+  double x;       // Vertex x component
+  double y;       // vertex y component
+} gpc_vertex;
+typedef struct {       // Vertex list structure
+  int num_vertices;    // Number of vertices in list
+  gpc_vertex *vertex;  // Vertex array pointer
+} gpc_vertex_list;
+typedef struct {             // Polygon set structure
+  int num_contours;          // Number of contours in polygon
+  int *hole;                 // Hole  external contour flags
+  gpc_vertex_list *contour;  // Contour array pointer
+} gpc_polygon;
+typedef struct {           // Tristrip set structure
+  int num_strips;          // Number of tristrips
+  gpc_vertex_list *strip;  // Tristrip array pointer
+} gpc_tristrip;
+typedef enum { LEFT, RIGHT } gpc_left_right;
+typedef enum { ABOVE, BELOW } gpc_above_below;
+typedef enum { CLIP, SUBJ } gpc_clip_subj;
+typedef enum {      /* Edge intersection classes         */
+               NUL, /* Empty non-intersection            */
+               EMX, /* External maximum                  */
+               ELI, /* External left intermediate        */
+               TED, /* Top edge                          */
+               ERI, /* External right intermediate       */
+               RED, /* Right edge                        */
+               IMM, /* Internal maximum and minimum      */
+               IMN, /* Internal minimum                  */
+               EMN, /* External minimum                  */
+               EMM, /* External maximum and minimum      */
+               LED, /* Left edge                         */
+               ILI, /* Internal left intermediate        */
+               BED, /* Bottom edge                       */
+               IRI, /* Internal right intermediate       */
+               IMX, /* Internal maximum                  */
+               FUL  /* Full non-intersection             */
+} vertex_type;
+typedef enum {     /* Horizontal edge states            */
+               NH, /* No horizontal edge                */
+               BH, /* Bottom horizontal edge            */
+               TH  /* Top horizontal edge               */
+} h_state;
+typedef enum {              /* Edge bundle state                 */
+               UNBUNDLED,   /* Isolated edge not within a bundle */
+               BUNDLE_HEAD, /* Bundle head node                  */
+               BUNDLE_TAIL  /* Passive bundle tail node          */
+} bundle_state;
+typedef struct v_shape { /* Internal vertex list datatype     */
+  double x;              /* X coordinate component            */
+  double y;              /* Y coordinate component            */
+  struct v_shape *next;  /* Pointer to next vertex in list    */
+} vertex_node;
+typedef struct p_shape { /* Internal contour / tristrip type  */
+  int active;            /* Active flag / vertex count        */
+  int hole;              /* Hole / external contour flag      */
+  vertex_node *v[2];     /* Left and right vertex list ptrs   */
+  struct p_shape *next;  /* Pointer to next polygon contour   */
+  struct p_shape *proxy; /* Pointer to actual structure used  */
+} polygon_node;
+typedef struct edge_shape {
+  gpc_vertex vertex;             /* Piggy-backed contour vertex data  */
+  gpc_vertex bot;                /* Edge lower (x, y) coordinate      */
+  gpc_vertex top;                /* Edge upper (x, y) coordinate      */
+  double xb;                     /* Scanbeam bottom x coordinate      */
+  double xt;                     /* Scanbeam top x coordinate         */
+  double dx;                     /* Change in x for a unit y increase */
+  int type;                      /* Clip / subject edge flag          */
+  int bundle[2][2];              /* Bundle edge flags                 */
+  int bside[2];                  /* Bundle left / right indicators    */
+  bundle_state bstate[2];        /* Edge bundle state                 */
+  polygon_node *outp[2];         /* Output polygon / tristrip pointer */
+  struct edge_shape *prev;       /* Previous edge in the AET          */
+  struct edge_shape *next;       /* Next edge in the AET              */
+  struct edge_shape *pred;       /* Edge connected at the lower end   */
+  struct edge_shape *succ;       /* Edge connected at the upper end   */
+  struct edge_shape *next_bound; /* Pointer to next bound in LMT      */
+} edge_node;
+inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); }
+inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); }
+inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); }
+inline int gpc_next_index(int i, int n) { return ((i + 1) % n); }
+inline int gpc_optimal(gpc_vertex *v, int i, int n) {
+  return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y);
+}
+inline int gpc_fwd_min(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y > v[i].vertex.y &&
+          v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y);
+}
+inline int gpc_not_fmax(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y > v[i].vertex.y);
+}
+inline int gpc_rev_min(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y >= v[i].vertex.y &&
+          v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
+}
+inline int gpc_not_rmax(edge_node *v, int i, int n) {
+  return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
+}
+// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j)
+// {
+inline void gpc_p_edge(edge_node *d, edge_node *e, int p) {
+  d = e;
+  do {
+    d = d->prev;
+  } while (!d->outp[p]);
+  // i = d->bot.x + d->dx * (j - d->bot.y);
+}
+// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j)
+// {
+inline void gpc_n_edge(edge_node *d, edge_node *e, int p) {
+  d = e;
+  do {
+    d = d->next;
+  } while (!d->outp[p]);
+  // i = d->bot.x + d->dx * (j - d->bot.y);
+}
+template <typename T>
+void gpc_malloc(T *&p, int b, char *s) {
+  if (b > 0) {
+    p = (T *)malloc(b);
+    if (!p) {
+      fprintf(stderr, "gpc malloc failure: %s\n", s);
+      exit(0);
+    }
+  } else {
+    p = NULL;
+  }
+}
+template <typename T>
+void gpc_free(T *&p) {
+  if (p) {
+    free(p);
+    p = NULL;
+  }
+}
+/*
+===========================================================================
+                       Public Function Prototypes
+===========================================================================
+*/
+void add_vertex(vertex_node **t, double x, double y);
+void gpc_vertex_create(edge_node *e, int p, int s, double x, double y);
+/*
+void gpc_read_polygon(FILE *infile_ptr, int read_hole_flags,
+                      gpc_polygon *polygon);
+void gpc_write_polygon(FILE *outfile_ptr, int write_hole_flags,
+                       gpc_polygon *polygon);
+*/
+void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole);
+void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
+                      gpc_polygon *clip_polygon, gpc_polygon *result_polygon);
+void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
+                       gpc_polygon *clip_polygon,
+                       gpc_tristrip *result_tristrip);
+void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip);
+void gpc_free_polygon(gpc_polygon *polygon);
+void gpc_free_tristrip(gpc_tristrip *tristrip);
+}  // namespace gpc
+#endif  // PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
--- a/paddle/fluid/operators/detection/poly_util.cc
+++ b/paddle/fluid/operators/detection/poly_util.cc
--- a/paddle/fluid/operators/detection/poly_util.h
+++ b/paddle/fluid/operators/detection/poly_util.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef POLY_UTIL_H_
+#define POLY_UTIL_H_
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/gpc.h"
+namespace paddle {
+namespace operators {
+template <class T>
+class Point_ {
+ public:
+  // default constructor
+  Point_() {}
+  Point_(T _x, T _y) {}
+  Point_(const Point_& pt) {}
+  Point_& operator=(const Point_& pt);
+  // conversion to another data type
+  // template<typename _T> operator Point_<_T>() const;
+  // conversion to the old-style C structures
+  // operator Vec<T, 2>() const;
+  // checks whether the point is inside the specified rectangle
+  // bool inside(const Rect_<T>& r) const;
+  T x;  //!< x coordinate of the point
+  T y;  //!< y coordinate of the point
+};
+template <class T>
+void Array2PointVec(const T*& box, const size_t box_size,
+                    std::vector<Point_<T>>& vec);
+template <class T>
+void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly);
+template <class T>
+void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly);
+template <class T>
+void Poly2PointVec(const gpc::gpc_vertex_list& contour,
+                   std::vector<Point_<T>>& vec);
+template <class T>
+T GetContourArea(std::vector<Point_<T>>& vec);
+template <class T>
+T PolyArea(const T* box, const size_t box_size, const bool normalized);
+template <class T>
+T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
+                  const bool normalized);
+}  // namespace operators
+}  // namespace paddle
+#include "paddle/fluid/operators/detection/poly_util.cc"
+#endif  // POLY_UTIL_H_
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -41,9 +41,9 @@ class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
        for (int id_w = 0; id_w < width; ++id_w) {
          id = id_n * height * width + width * id_h + id_w;
          if (id_n % 2 == 0) {
-            out_data[id] = id_w - in_data[id];
+            out_data[id] = id_w * 4 - in_data[id];
          } else {
-            out_data[id] = id_h - in_data[id];
+            out_data[id] = id_h * 4 - in_data[id];
          }
        }
      }

--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
@@ -32,9 +32,9 @@ __global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
  if (id_n < n && id_h < h && id_w < w) {
    int id = id_n * h * w + w * id_h + id_w;
    if (id_n % 2 == 0) {
-      output[id] = id_w - input[id];
+      output[id] = id_w * 4 - input[id];
    } else {
-      output[id] = id_h - input[id];
+      output[id] = id_h * 4 - input[id];
    }
  }
 }

--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -20,7 +20,7 @@ if(WITH_GRPC)
    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
  cc_test(rpc_server_test SRCS rpc_server_test.cc
    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
-  cc_test(varhandle_test SRCS varhandle_test.cc)
+  cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
  return()
 endif()

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
--- a/paddle/fluid/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
--- a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
--- a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
--- a/paddle/fluid/operators/math/cpu_lstm_compute.h
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.h
--- a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
--- a/paddle/fluid/operators/math/algorithm.h
+++ b/paddle/fluid/operators/math/algorithm.h
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
--- a/paddle/fluid/operators/math/cpu_lstm_compute.cc
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
--- a/paddle/fluid/operators/math/jit_kernel_lstm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_lstm.cc
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
--- a/paddle/fluid/operators/sequence_unpad_op.cc
+++ b/paddle/fluid/operators/sequence_unpad_op.cc
--- a/paddle/fluid/operators/sequence_unpad_op.cu
+++ b/paddle/fluid/operators/sequence_unpad_op.cu
--- a/paddle/fluid/operators/sequence_unpad_op.h
+++ b/paddle/fluid/operators/sequence_unpad_op.h
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cu
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
--- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
--- a/python/paddle/utils/plot.py
+++ b/python/paddle/utils/plot.py