diff --git a/.gitignore b/.gitignore
index 90138f996cf9cacc3c1cbff0cf2600eefca3f305..fa0c8882606b76ac71b43dcde7e1df6770c46c31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ third_party/
 build_*
 # clion workspace.
 cmake-build-*
+model_test
diff --git a/CMakeLists.txt b/CMakeLists.txt
index df00e977ebb547980e69ee421779c57717d771a9..ed704585d8a6bf3befd9a549aa5a62a33fea3da9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,13 +62,12 @@ option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
-option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
-option(WITH_INFERENCE    "Compile fluid inference library"              ON)
+option(ON_INFER         "Turn on inference optimization."               OFF)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
@@ -127,6 +126,9 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
 set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
   "A path setting fluid shared and static libraries")
 
+set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
+  "A path setting fluid inference shared and static libraries")
+
 if (WITH_C_API AND WITH_PYTHON)
   message(WARNING "It is suggest not embedded a python interpreter in Paddle "
     "when using C-API. It will give an unpredictable behavior when using a "
@@ -176,6 +178,7 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/xxhash)    # download xxhash
 
 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
@@ -298,3 +301,11 @@ if(WITH_DOC)
     find_python_module(recommonmark REQUIRED)
     add_subdirectory(doc)
 endif()
+
+if (ON_INFER)
+    message(STATUS "On inference mode, will take place some specific optimization.")
+    add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
diff --git a/Dockerfile b/Dockerfile
index 738bba9bc2e1ab19709722fe04f1490b1b13bd4f..c8b9eed6d60e5d3b32fc14c0c7af80a785145d1b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -75,14 +75,14 @@ RUN pip3 install -U wheel && \
     pip3 install -U docopt PyYAML sphinx==1.5.6 && \
     pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
     easy_install -U pip && \
-    pip install -U wheel && \
+    pip install -U pip setuptools wheel && \
     pip install -U docopt PyYAML sphinx==1.5.6 && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark
 
-RUN pip3 install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
     pip3 install opencv-python && \
-    pip install pre-commit 'ipython==5.3.0' && \
+    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
     pip install opencv-python
 
diff --git a/README.md b/README.md
index 46fdef5e376d3f5bf49ef10c62f5b3a6637913c1..56d6c10c642787836abb55cb2974bda0b8d22da4 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 
-### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
+### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==0.15.0.post87
+pip install paddlepaddle-gpu==1.1.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==0.15.0.post85
+pip install paddlepaddle-gpu==1.1.0.post85
 
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.15.0.post85
 
 ## Installation
 
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
 
 ## Documentation
 
-We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
 
    We appreciate your contributions!
 
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
index 9540900b112f54594bbfdbc8d7cd3b6e1f5269dd..ff616ddbb2cb1cb7f348d6d164815823b08b7629 100644
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -142,5 +142,10 @@ def parse_args():
         choices=['reduce', 'all_reduce'],
         default='all_reduce',
         help='Specify the reduce strategy, can be reduce, all_reduce')
+    parser.add_argument(
+        '--fuse_broadcast_op',
+        action='store_true',
+        help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.'
+    )
     args = parser.parse_args()
     return args
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index ddd9fe809853a830ca676cc98f1819f683866def..5f3ce300acc44ad8d2898c27296b866c403f3cc8 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -177,6 +177,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     else:
         build_strategy.reduce_strategy = fluid.BuildStrategy(
         ).ReduceStrategy.AllReduce
+    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
 
     avg_loss = train_args[0]
 
@@ -240,7 +241,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
 
             if args.use_fake_data or args.use_reader_op:
                 try:
-
                     fetch_ret = exe.run(fetch_list)
                 except fluid.core.EOFException as eof:
                     break
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
old mode 100644
new mode 100755
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c227e09719bd5f0e825f81fb96f78105aa10c79b
--- /dev/null
+++ b/cmake/external/xxhash.cmake
@@ -0,0 +1,50 @@
+INCLUDE(ExternalProject)
+
+set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
+set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
+set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
+
+IF(WITH_STATIC_LIB)
+  SET(BUILD_CMD make lib)
+ELSE()
+  IF(APPLE)
+    SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+  ELSE(APPLE)
+    SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+  ENDIF(APPLE)
+ENDIF()
+
+ExternalProject_Add(
+    extern_xxhash
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/Cyan4973/xxHash"
+    GIT_TAG         "v0.6.5"
+    PREFIX          ${XXHASH_SOURCE_DIR}
+    DOWNLOAD_NAME   "xxhash"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    PATCH_COMMAND
+    BUILD_COMMAND     ${BUILD_CMD}
+    INSTALL_COMMAND   export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
+    TEST_COMMAND      ""
+)
+
+set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
+INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
+
+add_library(xxhash STATIC IMPORTED GLOBAL)
+set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
+include_directories(${XXHASH_INCLUDE_DIR})
+add_dependencies(xxhash extern_xxhash)
+
+LIST(APPEND external_project_dependencies xxhash)
+
+IF(WITH_C_API)
+  INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
+  IF(ANDROID)
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
+  ENDIF()
+ENDIF()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 5bf82b4ddf10a646ca540ac4ee2cfd3d3bc6cf58..62227c67849dbb476339a176e0c98e295cbf529c 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -261,6 +261,13 @@ function(cc_library TARGET_NAME)
         add_dependencies(${TARGET_NAME} mklml)
         target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
       endif()
+      # remove link to python, see notes at:
+      # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
+      if("${cc_library_DEPS};" MATCHES "python;")
+        list(REMOVE_ITEM cc_library_DEPS python)
+        add_dependencies(${TARGET_NAME} python)
+        target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
+      endif()
       target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
     endif()
@@ -311,6 +318,8 @@ function(cc_test TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
 endfunction(cc_test)
 
@@ -629,6 +638,8 @@ function(py_test TARGET_NAME)
              PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
 endfunction()
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 077072f6eadb0c48f4ae32f94828613d89ed01c9..efdb093a7b28e19f3b2a774dd54f2e7f042e9ca7 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -18,7 +18,7 @@ function(copy TARGET)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DSTS DEPS)
     cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
+    set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
 
     list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
     list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -31,7 +31,7 @@ function(copy TARGET)
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD 
+        add_custom_command(TARGET ${TARGET} PRE_BUILD
           COMMAND mkdir -p "${dst}"
           COMMAND cp -r "${src}" "${dst}"
           COMMENT "copying ${src} -> ${dst}")
@@ -67,6 +67,13 @@ copy(boost_lib
   DEPS boost
 )
 
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
+copy(xxhash_lib
+  SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS xxhash
+)
+
 if(NOT PROTOBUF_FOUND)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
     copy(protobuf_lib
@@ -150,16 +157,16 @@ if (WITH_ANAKIN AND WITH_MKL)
         SRCS
         ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
         ${ANAKIN_INSTALL_DIR} # anakin release
-        DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
+        DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
      list(APPEND inference_deps anakin_inference_lib)
 endif()
 
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+       ${src_dir}/${module}/api/paddle_inference_api.h
        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
 set(module "platform")
@@ -185,20 +192,41 @@ copy(cmake_cache
   SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
   DSTS ${FLUID_INSTALL_DIR})
 
-add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+# This command generates a complete fluid library for both train and inference
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
+
+# Following commands generate a inference-only fluid library
+# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
+copy(third_party DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
+)
+
+# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library
+copy(inference_api_lib DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+       ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
+)
+
+add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
 
 # paddle fluid version
-execute_process(
-  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-set(version_file ${FLUID_INSTALL_DIR}/version.txt)
-file(WRITE ${version_file}
-  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-  "WITH_MKL: ${WITH_MKL}\n"
-  "WITH_GPU: ${WITH_GPU}\n")
-if(WITH_GPU)
-  file(APPEND ${version_file}
-    "CUDA version: ${CUDA_VERSION}\n"
-    "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
-endif()
+function(version version_file)
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+  file(WRITE ${version_file}
+    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+    "WITH_MKL: ${WITH_MKL}\n"
+    "WITH_MKLDNN: ${WITH_MKLDNN}\n"
+    "WITH_GPU: ${WITH_GPU}\n")
+  if(WITH_GPU)
+    file(APPEND ${version_file}
+      "CUDA version: ${CUDA_VERSION}\n"
+      "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+  endif()
+endfunction()
+version(${FLUID_INSTALL_DIR}/version.txt)
+version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 6653244507742b33d9524a7a0e4a5b2b575d358a..6b665a9effba4bef083d007c0c74f2f4c79e647e 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 
 add_subdirectory(testing)
+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
   add_subdirectory(fluid)
 endif()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 212724a0c7fc06edf0599ddcaad33aba1fead879..3aeba53572885de85b27d3058054bf8ae06d48a6 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -61,12 +61,12 @@ paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
+paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False))
-paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
+paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
@@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
 paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
@@ -84,7 +85,8 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
 paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
+paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer'))
 paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
@@ -95,17 +97,17 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
+paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
-paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None))
 paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
@@ -114,6 +116,7 @@ paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], var
 paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
 paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
+paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
 paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
@@ -171,6 +174,12 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
+paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
+paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -349,6 +358,8 @@ paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_wind
 paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
+paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
 paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 48b36df6499e59fe742766b5f81fd30a9fb8b900..7d48f0057140cf021a21ea7e304b7e38cc8b9ec2 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -9,8 +9,6 @@ add_subdirectory(pybind)
 add_subdirectory(recordio)
 endif(NOT WIN32)
 
-if(WITH_INFERENCE)
-  # NOTE: please add subdirectory inference at last.
-  add_subdirectory(inference)
-  add_subdirectory(train)
-endif()
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
+add_subdirectory(train)
diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc
index 0dcecb62dba971b48c4f11c0ef47494be40eeea0..fabf2abfc803b8838edb48aa01ab8896799c97ac 100644
--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
@@ -64,6 +64,13 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
     case proto::AttrType::LONG: {
       return attr_desc.l();
     }
+    case proto::AttrType::LONGS: {
+      std::vector<int64_t> val(attr_desc.longs_size());
+      for (int i = 0; i < attr_desc.longs_size(); ++i) {
+        val[i] = attr_desc.longs(i);
+      }
+      return val;
+    }
     default:
       PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
   }
diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
index 14ca3e96209ed17f12e87fda8506806514698977..d9c76881b7e98d0b7cd29024b98c8f7720398c66 100644
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -26,6 +26,113 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+template <typename T>
+struct ExtractAttribute {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  T* operator()(Attribute& attr) const {
+    T* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<T>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
+                   attr_name_, paddle::platform::demangle(typeid(T).name()),
+                   paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+// special handle bool
+// FIXME(yuyang18): Currently we cast bool into int in python binding. It is
+// hard to change the logic there. In another way, we should correct handle
+// if the user set `some_flag=1`.
+//
+// FIX ME anytime if there is a better solution.
+template <>
+struct ExtractAttribute<bool> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  bool* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<bool>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      float val = boost::get<float>(attr);
+      attr = static_cast<bool>(val);
+    }
+    bool* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<bool>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+template <>
+struct ExtractAttribute<int64_t> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  int64_t* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<int64_t>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      int val = boost::get<float>(attr);
+      attr = static_cast<int64_t>(val);
+    }
+    int64_t* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<int64_t>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+template <>
+struct ExtractAttribute<std::vector<int64_t>> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  std::vector<int64_t>* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(std::vector<int>)) {  // NOLINT
+      std::vector<int> val = boost::get<std::vector<int>>(attr);
+      std::vector<int64_t> vec(val.begin(), val.end());
+      attr = vec;
+    } else if (attr.type() == typeid(std::vector<float>)) {  // NOLINT
+      std::vector<float> val = boost::get<std::vector<float>>(attr);
+      std::vector<int64_t> vec(val.begin(), val.end());
+      attr = vec;
+    }
+    std::vector<int64_t>* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<std::vector<int64_t>>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
 template <typename T>
 inline proto::AttrType AttrTypeID() {
   Attribute tmp = T();
@@ -42,7 +149,11 @@ class AttrReader {
   inline const T& Get(const std::string& name) const {
     PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
                    name);
-    return boost::get<T>(attrs_.at(name));
+
+    Attribute& attr = const_cast<Attribute&>(attrs_.at(name));
+    ExtractAttribute<T> extract_attr(name);
+    T* attr_value = extract_attr(attr);
+    return *attr_value;
   }
 
  private:
@@ -82,7 +193,7 @@ class DefaultValueSetter {
  public:
   explicit DefaultValueSetter(T default_value)
       : default_value_(default_value) {}
-  void operator()(T& value) const { value = default_value_; }
+  void operator()(T& value) const { value = default_value_; }  // NOLINT
 
  private:
   T default_value_;
@@ -117,84 +228,6 @@ class EnumInContainer {
   std::unordered_set<T> container_;
 };
 
-template <typename T>
-struct ExtractAttribute {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  T* operator()(Attribute& attr) const {
-    T* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<T>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
-                   attr_name_, paddle::platform::demangle(typeid(T).name()),
-                   paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-// special handle bool
-// FIXME(yuyang18): Currently we cast bool into int in python binding. It is
-// hard to change the logic there. In another way, we should correct handle
-// if the user set `some_flag=1`.
-//
-// FIX ME anytime if there is a better solution.
-template <>
-struct ExtractAttribute<bool> {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  bool* operator()(Attribute& attr) const {
-    if (attr.type() == typeid(int)) {  // NOLINT
-      int val = boost::get<int>(attr);
-      attr = static_cast<bool>(val);
-    } else if (attr.type() == typeid(float)) {  // NOLINT
-      float val = boost::get<float>(attr);
-      attr = static_cast<bool>(val);
-    }
-    bool* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<bool>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
-                   attr_name_, paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-template <>
-struct ExtractAttribute<int64_t> {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  int64_t* operator()(Attribute& attr) const {
-    if (attr.type() == typeid(int)) {  // NOLINT
-      int val = boost::get<int>(attr);
-      attr = static_cast<int64_t>(val);
-    } else if (attr.type() == typeid(float)) {  // NOLINT
-      int val = boost::get<float>(attr);
-      attr = static_cast<int64_t>(val);
-    }
-    int64_t* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<int64_t>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
-                   attr_name_, paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
 // check whether a certain attribute fit its limits
 // an attribute can have more than one limits
 template <typename T>
@@ -235,7 +268,7 @@ class TypedAttrChecker {
     return *this;
   }
 
-  void operator()(AttributeMap& attr_map) const {
+  void operator()(AttributeMap& attr_map) const {  // NOLINT
     if (!attr_map.count(attr_name_)) {
       // user do not set this attr
       PADDLE_ENFORCE(!default_value_setter_.empty(),
@@ -271,7 +304,7 @@ class OpAttrChecker {
     return *(checker.target<TypedAttrChecker<T>>());
   }
 
-  void Check(AttributeMap& attr_map) const {
+  void Check(AttributeMap& attr_map) const {  // NOLINT
     for (const auto& checker : attr_checkers_) {
       checker(attr_map);
     }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index e0a3ef5a9c6c53c42ebea1a41cac0d18a77781b2..d8bc72e6b2fa38db06cb077ada9d7ec180299e8c 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -16,12 +16,14 @@ if(WITH_GPU)
             dynload_cuda variable_visitor)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
     nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+    nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 
 else()
     cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
              variable_visitor)
     cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
     cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+    cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
 
 cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
@@ -33,13 +35,15 @@ if(WITH_GPU)
           all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 endif()
 
+cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
+
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
 if(WITH_GPU)
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass sequential_execution_pass)
 else()
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto sequential_execution_pass)
 endif()
 
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
@@ -54,8 +58,9 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #        device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
         DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
 
 cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass)
+        fuse_elewise_add_act_pass multi_batch_merge_pass)
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 7c5f5bd80a937bf1a1c891155764833d7b21c5c2..b8690156763e4037811245b8016982710445e6a2 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       nccl_ctxs_(ctxs) {
   if (nccl_ctxs_) {
     for (auto &p : places_) {
-      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
+      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
 }
@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
             *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
         auto *var = scope.FindVar(out_var_handles[i]->name_);
-        auto *dev_ctx = dev_ctxes_[p];
+        auto *dev_ctx = dev_ctxes_.at(p);
 
         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
           auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 4fdab5cd94358d08eac7f8b041bf16d09042f0bd..7f0d06c892541a2697a4ed083f6f4c0fc774a2a4 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -48,16 +48,27 @@ void BroadcastOpHandle::RunImpl() {
     var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
   }
 
+  BroadcastOneVar(*in_var_handle, out_var_handles, var_scopes);
+}
+
+void BroadcastOpHandle::BroadcastOneVar(
+    const VarHandle &in_var_handle,
+    const std::vector<VarHandle *> &out_var_handles,
+    const std::vector<const Scope *> &var_scopes) {
   auto *in_var =
-      var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
+      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
   PADDLE_ENFORCE_NOT_NULL(in_var);
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
+  if (UNLIKELY(!in_tensor.IsInitialized())) {
+    VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
+    return;
+  }
 
-  InitOutputValue(*in_var_handle, out_var_handles);
+  InitOutputValue(in_var_handle, out_var_handles);
 
   if (platform::is_cpu_place(in_tensor.place())) {
     for (auto *out_var_handle : out_var_handles) {
-      if (out_var_handle->IsTheSameVar(*in_var_handle)) {
+      if (out_var_handle->IsTheSameVar(in_var_handle)) {
         continue;
       }
       auto &out_p = out_var_handle->place_;
@@ -114,12 +125,12 @@ void BroadcastOpHandle::RunImpl() {
         }
       }
 
-      if (!out_handle->IsTheSameVar(*in_var_handle)) {
-        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+      if (!out_handle->IsTheSameVar(in_var_handle)) {
+        auto out_var = var_scopes.at(in_var_handle.scope_idx_)
                            ->FindVar(out_var_handles[0]->name_);
         paddle::framework::TensorCopy(
-            in_tensor, in_var_handle->place_,
-            *(dev_ctxes_.at(in_var_handle->place_)),
+            in_tensor, in_var_handle.place_,
+            *(dev_ctxes_.at(in_var_handle.place_)),
             &VariableVisitor::GetMutableTensor(out_var));
       }
     });
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index fe4e733e43417977df324fde808f52b228a27d19..72180fac864256ddda076c57e50ab1083c113d32 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
         nccl_ctxs_(nccl_ctxs) {
     if (nccl_ctxs_) {
       for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
       }
     }
   }
@@ -61,7 +62,10 @@ struct BroadcastOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
- private:
+  void BroadcastOneVar(const VarHandle &in_var_handle,
+                       const std::vector<VarHandle *> &out_var_handles,
+                       const std::vector<const Scope *> &var_scopes);
+
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index ab7412a19fbd13fa39dbae9af528d158cc9ddbd0..650de5a48de6b1fdab120cdeda563a169fd1a1c1 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -12,232 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/broadcast_op_handle.h"
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-// test data amount
-const f::DDim kDims = {20, 20};
-
-struct TestBroadcastOpHandle {
-  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
-  std::vector<Scope*> local_scopes_;
-  std::vector<Scope*> param_scopes_;
-  Scope g_scope_;
-  std::unique_ptr<OpHandleBase> op_handle_;
-  std::vector<std::unique_ptr<VarHandleBase>> vars_;
-  std::vector<p::Place> gpu_list_;
-  bool use_gpu_;
-#ifdef PADDLE_WITH_CUDA
-  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
-#endif
-
-  void WaitAll() {
-    for (size_t j = 0; j < ctxs_.size(); ++j) {
-      ctxs_[j]->Wait();
-    }
-#ifdef PADDLE_WITH_CUDA
-    if (nccl_ctxs_) {
-      nccl_ctxs_->WaitAll();
-    }
-#endif
-  }
-
-  void InitCtxOnGpu(bool use_gpu) {
-    use_gpu_ = use_gpu;
-    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
-      int count = p::GetCUDADeviceCount();
-      if (count <= 1) {
-        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
-                        "device count is "
-                     << count;
-        exit(0);
-      }
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CUDAPlace(i);
-        gpu_list_.push_back(p);
-        ctxs_.emplace_back(new p::CUDADeviceContext(p));
-      }
-      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-      int count = 8;
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CPUPlace();
-        gpu_list_.push_back(p);
-        ctxs_.emplace_back(new p::CPUDeviceContext(p));
-      }
-#ifdef PADDLE_WITH_CUDA
-      nccl_ctxs_.reset(nullptr);
-#endif
-    }
-  }
-
-  void InitBroadcastOp(size_t input_scope_idx) {
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      local_scopes_.push_back(&(g_scope_.NewScope()));
-      Scope& local_scope = local_scopes_.back()->NewScope();
-      *local_scopes_.back()
-           ->Var(details::kLocalExecScopeName)
-           ->GetMutable<Scope*>() = &local_scope;
-      local_scope.Var("out");
-      param_scopes_.emplace_back(&local_scope);
-    }
-    param_scopes_[input_scope_idx]->Var("input");
-
-    std::unique_ptr<ir::Node> n =
-        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
-    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
-                                             nccl_ctxs_.get()));
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
-                                             nccl_ctxs_.get()));
-#else
-      op_handle_.reset(
-          new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
-#endif
-    }
-
-    std::unique_ptr<ir::Node> v =
-        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
-    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
-                                        gpu_list_[input_scope_idx]);
-    vars_.emplace_back(in_var_handle);
-    op_handle_->AddInput(in_var_handle);
-
-    // add dummy var
-
-    std::unique_ptr<ir::Node> v2 =
-        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
-    vars_.emplace_back(new DummyVarHandle(v2.get()));
-    DummyVarHandle* dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
-    dummy_var_handle->ClearGeneratedOp();
-    op_handle_->AddInput(dummy_var_handle);
-
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      if (!use_gpu_) {
-        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      }
-      std::unique_ptr<ir::Node> v3 =
-          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
-      VarHandle* out_var_handle =
-          new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
-      vars_.emplace_back(out_var_handle);
-      op_handle_->AddOutput(out_var_handle);
-    }
-
-    // add dummy var
-    std::unique_ptr<ir::Node> v4 =
-        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
-    vars_.emplace_back(new DummyVarHandle(v4.get()));
-    DummyVarHandle* out_dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
-    out_dummy_var_handle->ClearGeneratedOp();
-    op_handle_->AddOutput(out_dummy_var_handle);
-  }
-
-  void TestBroadcastLodTensor(size_t input_scope_idx) {
-    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
-    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
-
-    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k;
-    }
-    f::LoD lod{{0, 10, 20}};
-    paddle::framework::TensorFromVector<float>(
-        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
-    in_lod_tensor->set_lod(lod);
-    in_lod_tensor->Resize(kDims);
-
-    op_handle_->Run(false);
-
-    WaitAll();
-
-    p::CPUPlace cpu_place;
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      auto out_var = param_scopes_[j]->FindVar("out");
-      PADDLE_ENFORCE_NOT_NULL(out_var);
-      auto out_tensor = out_var->Get<f::LoDTensor>();
-      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
-
-      f::Tensor result_tensor;
-      f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
-      float* ct = result_tensor.mutable_data<float>(cpu_place);
-
-      for (int64_t i = 0; i < f::product(kDims); ++i) {
-        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
-      }
-    }
-  }
-
-  void TestBroadcastSelectedRows(size_t input_scope_idx) {
-    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
-    auto value = in_selected_rows->mutable_value();
-    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
-    int height = static_cast<int>(kDims[0]) * 2;
-    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-    in_selected_rows->set_height(height);
-    in_selected_rows->set_rows(rows);
-
-    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k;
-    }
-    paddle::framework::TensorFromVector<float>(
-        send_vector, *(ctxs_[input_scope_idx]), value);
-
-    op_handle_->Run(false);
-
-    WaitAll();
-
-    p::CPUPlace cpu_place;
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      auto out_var = param_scopes_[j]->FindVar("out");
-      PADDLE_ENFORCE_NOT_NULL(out_var);
-      auto& out_select_rows = out_var->Get<f::SelectedRows>();
-      auto rt = out_select_rows.value();
-
-      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
-                        "height is not equal.");
-      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
-      }
-
-      f::Tensor result_tensor;
-      f::TensorCopySync(rt, cpu_place, &result_tensor);
-      float* ct = result_tensor.data<float>();
-
-      for (int64_t i = 0; i < f::product(kDims); ++i) {
-        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
-      }
-    }
-  }
-};
-
 TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a2a9ac328c4a9b89bfb89106af81b9fb3ed3028
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -0,0 +1,271 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestBroadcastOpHandle {
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<Scope*> param_scopes_;
+  Scope g_scope_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> place_list_;
+  bool use_gpu_;
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+#ifdef PADDLE_WITH_CUDA
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
+#endif
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    use_gpu_ = use_gpu;
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        place_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+      nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        place_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+#ifdef PADDLE_WITH_CUDA
+      nccl_ctxs_.reset(nullptr);
+#endif
+    }
+  }
+
+  void InitBroadcastOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < place_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      local_scope.Var("out");
+      param_scopes_.emplace_back(&local_scope);
+    }
+    param_scopes_[input_scope_idx]->Var("input");
+
+    std::unique_ptr<ir::Node> n =
+        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
+                                             place_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
+                                             place_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(
+          new BroadcastOpHandle(n.get(), local_scopes_, place_list_));
+#endif
+    }
+
+    std::unique_ptr<ir::Node> v =
+        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
+    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
+                                        place_list_[input_scope_idx]);
+    vars_.emplace_back(in_var_handle);
+    op_handle_->AddInput(in_var_handle);
+
+    // add dummy var
+
+    std::unique_ptr<ir::Node> v2 =
+        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
+    vars_.emplace_back(new DummyVarHandle(v2.get()));
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    dummy_var_handle->ClearGeneratedOp();
+    op_handle_->AddInput(dummy_var_handle);
+
+    for (size_t j = 0; j < place_list_.size(); ++j) {
+      if (!use_gpu_) {
+        op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
+      }
+      std::unique_ptr<ir::Node> v3 =
+          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
+      VarHandle* out_var_handle =
+          new VarHandle(v3.get(), 2, j, "out", place_list_[j]);
+      vars_.emplace_back(out_var_handle);
+      op_handle_->AddOutput(out_var_handle);
+    }
+
+    // add dummy var
+    std::unique_ptr<ir::Node> v4 =
+        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
+    vars_.emplace_back(new DummyVarHandle(v4.get()));
+    DummyVarHandle* out_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    out_dummy_var_handle->ClearGeneratedOp();
+    op_handle_->AddOutput(out_dummy_var_handle);
+  }
+
+  std::vector<float> InitLoDTensor(const std::string& varname,
+                                   size_t input_scope_idx, const f::LoD& lod,
+                                   float val_scalar = 0.0) {
+    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
+
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto lod_tensor = var->GetMutable<f::LoDTensor>();
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k + val_scalar;
+    }
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), lod_tensor);
+    lod_tensor->set_lod(lod);
+    lod_tensor->Resize(kDims);
+    return send_vector;
+  }
+
+  std::vector<float> InitSelectedRows(const std::string& varname,
+                                      size_t input_scope_idx,
+                                      const std::vector<int64_t>& rows,
+                                      int height, float value_scalar = 0.0) {
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k + value_scalar;
+    }
+
+    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto selected_rows = var->GetMutable<f::SelectedRows>();
+    auto value = selected_rows->mutable_value();
+    value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
+    selected_rows->set_height(height);
+    selected_rows->set_rows(rows);
+
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), value);
+
+    return send_vector;
+  }
+
+  void SelectedRowsEqual(const std::string& varname, int input_scope_idx,
+                         const std::vector<float>& send_vector,
+                         const std::vector<int64_t>& rows, int height) {
+    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto& selected_rows = var->Get<f::SelectedRows>();
+    auto rt = selected_rows.value();
+    PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
+
+    for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
+    }
+
+    p::CPUPlace cpu_place;
+    f::Tensor result_tensor;
+    f::TensorCopySync(rt, cpu_place, &result_tensor);
+    float* ct = result_tensor.data<float>();
+
+    for (int64_t i = 0; i < f::product(kDims); ++i) {
+      ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+    }
+  }
+
+  void LoDTensorEqual(const std::string& varname,
+                      const std::vector<float>& send_vec, const f::LoD& lod,
+                      framework::Scope* scope) {
+    p::CPUPlace cpu_place;
+    auto var = scope->FindVar(varname);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto tensor = var->Get<f::LoDTensor>();
+    PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
+    f::Tensor result_tensor;
+    f::TensorCopySync(tensor, cpu_place, &result_tensor);
+    float* ct = result_tensor.mutable_data<float>(cpu_place);
+    for (int64_t k = 0; k < f::product(kDims); ++k) {
+      ASSERT_NEAR(ct[k], send_vec[k], 1e-5);
+    }
+  }
+
+  void TestBroadcastLodTensor(size_t input_scope_idx) {
+    f::LoD lod{{0, 10, 20}};
+    auto send_vector = InitLoDTensor("input", input_scope_idx, lod);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+    for (size_t j = 0; j < place_list_.size(); ++j) {
+      LoDTensorEqual("out", send_vector, lod, param_scopes_[j]);
+    }
+  }
+
+  void TestBroadcastSelectedRows(size_t input_scope_idx) {
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    int height = static_cast<int>(kDims[0] * 2);
+    auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+    for (size_t j = 0; j < place_list_.size(); ++j) {
+      SelectedRowsEqual("out", input_scope_idx, send_vector, rows, height);
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 6a6b497fa897e3882995688bf36704b1d77ea962..bc19bd36610bf144f163c8ebf582d4afbc6592e3 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
@@ -27,6 +28,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
  public:
   explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
       : ir::PassBuilder(), strategy_(strategy) {
+    if (strategy_.enable_sequential_execution_) {
+      AppendPass("sequential_execution_pass");
+    }
+
     // Add a graph viz pass to record a graph.
     if (!strategy_.debug_graphviz_path_.empty()) {
       auto viz_pass = AppendPass("graph_viz_pass");
@@ -110,6 +115,11 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+    } else if (pass->Type() == "sequential_execution_pass") {
+      pass->Erase(kAllOpDescs);
+      pass->Set<const std::vector<OpDesc *>>(
+          kAllOpDescs,
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     }
     graph = pass->Apply(std::move(graph));
   }
@@ -121,6 +131,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 
 USE_PASS(fuse_elewise_add_act_pass);
 USE_PASS(graph_viz_pass);
+USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+USE_PASS(sequential_execution_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 02c4bea16916d58a6d0fce8918f8fceb9ff9356e..88459320b0eb6d6c4405bff4c8b13c99aa7edb0d 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -69,6 +69,10 @@ struct BuildStrategy {
 
   bool enable_data_balance_{false};
 
+  bool enable_sequential_execution_{false};
+
+  bool fuse_broadcast_op_{false};
+
   // User normally doesn't need to call this API.
   // The PassBuilder allows for more customized insert, remove of passes
   // from python side.
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index b6282debdb4eb6b1f29c39e54ac4f3e2296838da..f9bbfe0016ce0ea0d15a83cb532c44518549b8ad 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
   bool need_wait =
       in_var && in_var->GeneratedOp() &&
-      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
   return need_wait;
 }
 
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 525d24322442ef4dd6e8c24212af61c908959b87..0b772f9b63e2cfb78175f5e0d7011db8e6a5ec20 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
     : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
   if (ctxs) {
     for (auto &p : places_) {
-      this->dev_ctxes_[p] = ctxs->DevCtx(p);
+      this->SetDeviceContext(p, ctxs->DevCtx(p));
     }
   }
 }
@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
   PADDLE_ENFORCE_GT(places_.size(), 1,
                     "Data balance can only be enabled when the number of "
                     "places to run larger than 1.");
-  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
   PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 6e22fedf1c39428528c00cce4c9a4460dfb95cb3..98fc390e72fab3701538fd6f974460fa5114fdb0 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
 
   size_t num_complete = 0;
   remaining_ = 0;
-  BlockingQueue<size_t> complete_q;
+  auto complete_q = std::make_shared<BlockingQueue<size_t>>();
   for (auto op : bootstrap_ops_) {
-    RunOpAsync(op_deps.get(), op, &complete_q);
+    RunOpAsync(op_deps.get(), op, complete_q);
   }
 
   while (num_complete != op_deps->size()) {
-    size_t num_comp = complete_q.Pop();
+    size_t num_comp = complete_q->Pop();
     if (num_comp == -1UL) {
       int remaining = 0;
       while (true) {
@@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
           break;
         }
         for (int i = 0; i < remaining; ++i) {
-          complete_q.Pop();
+          complete_q->Pop();
         }
       }
       exception_.ReThrow();
@@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
 }
 void FastThreadedSSAGraphExecutor::RunOpAsync(
     std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-    OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
+    OpHandleBase *op,
+    const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
   ++remaining_;
   this->pool_.enqueue([=] {
     OpHandleBase *op_to_run = op;
@@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
             if (op_to_run == nullptr) {
               op_to_run = pending_op;
             } else {
-              this->RunOpAsync(op_deps, pending_op, complete_q);
+              RunOpAsync(op_deps, pending_op, complete_q);
             }
           }
         }
@@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
 }
 void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
   atomic_op_deps_ = pool_.enqueue([&] {
-    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps =
-        new std::unordered_map<OpHandleBase *, std::atomic<int>>;
+    auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
     for (auto &pair : op_deps_) {
       (*op_deps)[pair.first] = pair.second;
     }
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index dad3a231cba6402f57ba654a9ac5fb520b9c8f04..8b8382447105c8caa36963214684d6ee9fa15200 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::atomic<int> remaining_;
 
   void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-                  OpHandleBase *op, BlockingQueue<size_t> *complete_q);
+                  OpHandleBase *op,
+                  const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
 
   void PrepareAtomicOpDeps();
 
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51dfa2d0711f49aaefab0af3549283dbf77eee4a
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
@@ -0,0 +1,55 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void FusedBroadcastOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
+  if (places_.size() == 1UL) return;
+
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+
+  WaitInputVarGenerated();
+
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+
+  size_t place_num = places_.size();
+  PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
+
+  for (size_t i = 0; i < in_var_handles.size(); ++i) {
+    BroadcastOneVar(
+        *in_var_handles[i],
+        std::vector<VarHandle *>(out_var_handles.begin() + i * place_num,
+                                 out_var_handles.begin() + (i + 1) * place_num),
+        var_scopes);
+  }
+}
+
+std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; }
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..e37259526a5f6f57d51a0ca8bca96a18211a4790
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -0,0 +1,57 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FusedBroadcastOpHandle : public BroadcastOpHandle {
+ public:
+#ifdef PADDLE_WITH_CUDA
+  FusedBroadcastOpHandle(ir::Node *node,
+                         const std::vector<Scope *> local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const platform::NCCLContextMap *nccl_ctx)
+      : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
+#else
+  FusedBroadcastOpHandle(ir::Node* node, const std::vector<Scope*> local_scopes,
+                         const std::vector<platform::Place>& places)
+      : BroadcastOpHandle(node, local_scopes, places) {}
+#endif
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f12bd2b4e857648342aeb5ad33b6c0fe01c9c73
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -0,0 +1,165 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
+  std::vector<std::string> out_varnames_;
+
+  void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
+    // initialize scope and var
+    for (size_t i = 0; i < place_list_.size(); ++i) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
+        local_scope.Var("out_var" + j);
+        if (i == j) local_scope.Var("in_var" + j);
+      }
+      param_scopes_.emplace_back(&local_scope);
+    }
+
+    // create op handle node
+    std::unique_ptr<ir::Node> n =
+        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation);
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(new FusedBroadcastOpHandle(
+          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(new FusedBroadcastOpHandle(
+          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(
+          new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_));
+#endif
+    }
+
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      // add input var handle
+      std::unique_ptr<ir::Node> in_node =
+          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable);
+      VarHandle* in_var_handle =
+          new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i,
+                        place_list_[input_scope_idxes[i]]);
+      vars_.emplace_back(in_var_handle);
+      op_handle_->AddInput(in_var_handle);
+
+      // add output var handle
+      for (size_t j = 0; j < place_list_.size(); ++j) {
+        std::unique_ptr<ir::Node> out_node =
+            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable);
+        VarHandle* out_var_handle =
+            new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]);
+        vars_.emplace_back(out_var_handle);
+        op_handle_->AddOutput(out_var_handle);
+      }
+    }
+  }
+
+  void TestFusedBroadcastLoDTensor(std::vector<size_t> input_scope_idxes) {
+    std::vector<std::vector<float>> send_vec;
+    f::LoD lod{{0, 10, 20}};
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      const std::string varname("in_var" + i);
+      float val_scalar = static_cast<float>(i);
+      send_vec.push_back(
+          InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
+    }
+
+    op_handle_->Run(false);
+
+    WaitAll();
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      const std::string& varname("out_var" + i);
+      for (size_t j = 0; j < place_list_.size(); ++j) {
+        LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
+      }
+    }
+  }
+
+  void TestFusedBroadcastSelectedRows(std::vector<size_t> input_scope_idxes) {
+    std::vector<std::vector<float>> send_vector;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    int height = static_cast<int>(kDims[0] * 2);
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      const std::string varname("in_var" + i);
+      float val_scalar = static_cast<float>(i);
+      send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
+                                             rows, height, val_scalar));
+    }
+
+    op_handle_->Run(false);
+
+    WaitAll();
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      const std::string& varname("out_var" + i);
+      for (size_t j = 0; j < place_list_.size(); ++j) {
+        SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
+                          height);
+      }
+    }
+  }
+};
+
+TEST(FusedBroadcastTester, CPULodTensor) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnGpu(false);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
+}
+
+TEST(FusedBroadcastTester, CPUSelectedRows) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnGpu(false);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(FusedBroadcastTester, GPULodTensor) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnGpu(true);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
+}
+
+TEST(FusedBroadcastTester, GPUSelectedRows) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnGpu(true);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 9aae19fc73de4387186da47c55710c94d53f1b88..ca4633c5a8f22fc9f7319b06aa766f9fe37dc68c 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
 
   VarHandle *out_var_handle;
   {
-    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
     PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
                       "The number of output should be one.");
     out_var_handle = out_var_handles.front();
@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
   Tensor *out_tensor = out_value->mutable_value();
 
   // copy
-  auto dev_ctx = dev_ctxes_[out_var_handle->place_];
+  auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
   RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
                                              t_out_p] {
     int s = 0, e = 0;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 134fcee826715672a6e021e9bf694bb771ebb830..f3819887a196a7c8bf35897467bb9d68b428094e 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/data_balance_op_handle.h"
+#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
@@ -252,9 +253,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
   std::vector<ir::Node *> sorted_ret;
   for (size_t i = 0; i < ret.size(); ++i) {
     if (i < last_backward) {
-      if (boost::get<int>(ret[i]->Op()->GetAttr(
-              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-          static_cast<int>(OpRole::kOptimize)) {
+      if (static_cast<bool>(boost::get<int>(ret[i]->Op()->GetAttr(
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kOptimize))) {
         optimize_ops.push_back(ret[i]);
       } else {
         sorted_ret.push_back(ret[i]);
@@ -347,7 +348,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
           BuildStrategy::GradientScaleStrategy::kCustomized) {
         // TODO(paddle-dev): Why is there no input for this op_handle?
         auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
-        CreateScaleLossGradOp(&result, loss_grad_name);
+        CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]);
       }
       // This assumes the backward generating code will ensure IsScaleLossOp
       // is true only for the op that scale the final scalar loss.
@@ -436,10 +437,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   if ((use_gpu &&
        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
       is_dist_train) {
-    for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
-      auto &to_bcast_set = bcast_var_name_set[dev_id];
-      for (auto &bcast_name : to_bcast_set) {
-        CreateBroadcastOp(&result, bcast_name, dev_id);
+    if (strategy_.fuse_broadcast_op_) {
+      CreateFusedBroadcastOp(&result, bcast_var_name_set);
+    } else {
+      for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
+        auto &to_bcast_set = bcast_var_name_set[dev_id];
+        for (auto &bcast_name : to_bcast_set) {
+          CreateBroadcastOp(&result, bcast_name, dev_id);
+        }
       }
     }
   }
@@ -508,6 +513,44 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
   }
 }
 
+void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
+    ir::Graph *result,
+    const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
+#ifdef PADDLE_WITH_CUDA
+  auto *op_handle = new FusedBroadcastOpHandle(
+      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_);
+#else
+  auto *op_handle = new FusedBroadcastOpHandle(
+      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_);
+#endif
+  result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
+
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+  }
+
+  for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) {
+    for (auto &p_name : bcast_varnames[dev_id]) {
+      auto *in =
+          result->Get<GraphVars>(kGraphVars).at(dev_id).at(p_name).back().get();
+      op_handle->AddInput(in);
+      for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) {
+        auto &p = places_[out_dev_id];
+        auto &vars =
+            result->Get<GraphVars>(kGraphVars).at(out_dev_id).at(p_name);
+        auto *out_var = new VarHandle(
+            result->CreateEmptyNode(p_name, ir::Node::Type::kVariable),
+            vars.size(), out_dev_id, p_name, p);
+        vars.emplace_back(out_var);
+        op_handle->AddOutput(out_var);
+      }
+    }
+  }
+}
+
 void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                     ir::Node *node,
                                                     int dev_id) const {
@@ -602,7 +645,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
 }
 
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
-    ir::Graph *result, const std::string &loss_grad_name) const {
+    ir::Graph *result, const std::string &loss_grad_name,
+    ir::Node *out_var_node) const {
   for (size_t i = 0; i < places_.size(); ++i) {
     // Insert ScaleCost OpHandle
     auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
@@ -617,10 +661,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
     // loss->pending_ops_.emplace_back(op_handle);
     // op_handle->inputs_.emplace_back(loss);
 
-    CreateOpOutput(
-        result, op_handle,
-        result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
-        places_[i], i);
+    CreateOpOutput(result, op_handle,
+                   result->CreateVarNode(out_var_node->Var()), places_[i], i);
   }
 }
 
@@ -680,7 +722,8 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   }
 
   if (node->Op()->Type() == "split_byref" ||
-      node->Op()->Type() == "split_selected_rows") {
+      node->Op()->Type() == "split_selected_rows" ||
+      node->Op()->Type() == "split_ids") {
     // TODO(paddle-dev): getting the first var is not safe.
     op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index cdf9f13cde608b546d17a1e53e0f6acea9e12566..03b2de2f04da4bac8d342a76c80fd12beaeba4b7 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -61,7 +61,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                               size_t num_places) const;
 
   void CreateScaleLossGradOp(ir::Graph *result,
-                             const std::string &loss_grad_name) const;
+                             const std::string &loss_grad_name,
+                             ir::Node *out_var_node) const;
 
   VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;
@@ -78,6 +79,10 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;
 
+  void CreateFusedBroadcastOp(
+      ir::Graph *result,
+      const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;
+
   bool IsSparseGradient(const std::string &og) const;
 
   size_t GetAppropriateDeviceID(
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 3812f0abf1b7069525c4420054c61c01c908acfe..4822627ac3b65972f41d9a23d9fe3dba3de3f97d 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
   for (auto *in : inputs_) {
     if (NeedWait(in)) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
     }
   }
 }
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 9fbefabc841e3f6940860f60d959fee97495e4c9..d09b94a3fd32952985a37cf4246c7640d2db4f56 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -64,7 +64,8 @@ class OpHandleBase {
   virtual bool IsMultiDeviceTransfer() { return false; }
 
   const platform::DeviceContext *DeviceContext(platform::Place place) {
-    return dev_ctxes_[place];
+    auto it = dev_ctxes_.find(place);
+    return it != dev_ctxes_.end() ? it->second : nullptr;
   }
 
   void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7fc06f234d42a992328c0b6164f17945d8075c28..4503123eac810917cabcf1e62cff98552ed2f742 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -27,7 +27,7 @@ namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index a6289b055f97b7b0e57928358d84117b33cf2df8..999828ae457ba43541da06088ce7c25331fd05ec 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
         nccl_ctxs_(nccl_ctxs) {
     if (nccl_ctxs_) {
       for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
       }
     }
   }
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
index f44b374edb29228dff5a8bf003d945291f166d49..65df7f2d510bf4e3e930398182c6dd1eae89241f 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
       continue;
     }
     if (in->GeneratedOp()) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
     }
   }
   auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index ba243979b34aa1f683de707525403becaf0a1c00..ef1626599795a553e654fe5d3ed74ef3a3a67d78 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
       coeff_(static_cast<float>(1.0 / num_dev)),
       scope_(scope),
       place_(place) {
-  dev_ctxes_[place_] = dev_ctx;
+  this->SetDeviceContext(place_, dev_ctx);
 }
 
 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() {
   } else {
 #ifdef PADDLE_WITH_CUDA
     this->RunAndRecordEvent([&] {
-      auto stream =
-          static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
-              ->stream();
+      auto stream = static_cast<platform::CUDADeviceContext *>(
+                        this->dev_ctxes_.at(place_))
+                        ->stream();
       memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                    platform::CPUPlace(), &coeff_, sizeof(float), stream);
       VLOG(10) << place_ << "RUN Scale loss grad op";
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc2c8bfef9f9f54c2e499467df0d22ce3f69d6b8
--- /dev/null
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
+
+std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  // FIXME(zjl): Insert dependencies between some distributed ops may cause
+  // the multi_devices_graph_pass fails. So we skip these ops here.
+  // Indeed, maybe we should not insert dependencies between these ops
+  // casually, which may cause deadlock easily.
+  // We should add more skipped distributed ops when found errors in
+  // multi_devices_graph_pass
+  static std::unordered_set<std::string> skip_dist_ops{
+      "send", "recv", "send_barrier", "fetch_barrier"};
+
+  auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
+  std::vector<ir::Node *> op_node_list;
+  op_node_list.reserve(ops.size());
+
+  std::unordered_map<ir::Node *, size_t> op_deps;
+  std::unordered_map<ir::Node *, std::unordered_set<ir::Node *>> pending_ops;
+  std::unordered_set<ir::Node *> ready_ops;
+
+  for (ir::Node *node : graph->Nodes()) {
+    if (!node->IsOp()) continue;
+    std::unordered_set<ir::Node *> preceding_ops;
+    for (auto *in : node->inputs) {
+      PADDLE_ENFORCE(in->IsVar(),
+                     "Preceding Node of Op Nodes must be Var Node");
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(),
+                     "Preceding Op Node of Var Node must be unique");
+      preceding_ops.insert(in->inputs[0]);
+      pending_ops[in->inputs[0]].insert(node);
+    }
+    op_deps[node] = preceding_ops.size();
+    if (preceding_ops.empty()) {
+      ready_ops.insert(node);
+    }
+  }
+
+  for (auto *op_desc : ops) {
+    ir::Node *found_node = nullptr;
+    for (auto *node : ready_ops) {
+      if (IsSameOpDesc(op_desc, node->Op())) {
+        PADDLE_ENFORCE(found_node == nullptr,
+                       "Found multiple op_desc in graph: %s", op_desc->Type());
+        found_node = node;
+      }
+    }
+
+    PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s",
+                            op_desc->Type());
+    for (auto *pending_op : pending_ops[found_node]) {
+      if (--op_deps.at(pending_op) == 0) {
+        ready_ops.insert(pending_op);
+      }
+    }
+    ready_ops.erase(found_node);
+    if (skip_dist_ops.count(op_desc->Type()) == 0) {
+      op_node_list.push_back(found_node);
+    }
+  }
+
+  for (size_t i = 1; i < op_node_list.size(); ++i) {
+    auto *dep_var = graph->CreateControlDepVar();
+    op_node_list[i]->inputs.push_back(dep_var);
+    op_node_list[i - 1]->outputs.push_back(dep_var);
+    dep_var->outputs.push_back(op_node_list[i]);
+    dep_var->inputs.push_back(op_node_list[i - 1]);
+    VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
+             << " and " << op_node_list[i]->Name();
+  }
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(sequential_execution_pass,
+              paddle::framework::details::SequentialExecutionPass)
+    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..a04c08bc2eb3bae797d648b30a22a5fee7ba0eaa
--- /dev/null
+++ b/paddle/fluid/framework/details/sequential_execution_pass.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char kAllOpDescs[] = "all_op_descs";
+
+class SequentialExecutionPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 31beef3ae829d72570ee7c879dac71ed600cd216..dc63effd1b7c8fe5bb3fc91058eb855e552d3926 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
       new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
-  BlockingQueue<VarHandleBase *> ready_vars;
+  auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
   std::unordered_set<OpHandleBase *> ready_ops;
   // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
   // streams from multiple GPUs, it's faster to buffer them and schedule
@@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
+        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get());
       }
     }
   }
   for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, &ready_vars, var.get());
+    InsertPendingVar(&pending_vars, ready_vars.get(), var.get());
   }
 
   for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
@@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   FeedFetchList fetch_data(fetch_tensors.size());
 
   InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
-                 &pending_vars, &ready_vars, &fetch_data);
+                 &pending_vars, ready_vars.get(), &fetch_data);
 
   auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
     for (auto *op : set) {
       running_ops_++;
-      RunOp(&ready_vars, op);
+      RunOp(ready_vars, op);
     }
     set.clear();
   };
@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   run_op_futures_.clear();
   exception_holder_.Clear();
   event.reset(nullptr);
-
   // Step 3. Execution
   while (!pending_vars.empty()) {
     // 1. Run All Ready ops
@@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
     // 2. Find ready variable
     bool timeout;
-    auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
+    auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
 
     if (timeout) {
       if (exception_holder_.IsCaught()) {
@@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     }
   }
   PADDLE_ENFORCE(ready_ops.empty());
-
   // Wait FetchOps.
   ClearFetchOp(graph_.get(), &fetch_ops);
 
@@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
 }
 
 void ThreadedSSAGraphExecutor::RunOp(
-    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
+    const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
+    details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
       if (VLOG_IS_ON(10)) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 512f8a4ca5a9b82a395dde11722b8db44ea5ec27..dbb0b498d995a897b109bd4ef98521b2193276ed 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ~ThreadedSSAGraphExecutor() {}
 
  private:
-  void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
+  void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
              details::OpHandleBase *op);
 
  private:
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index d8c2bc40b9458a1d5a7dd8a32277d04f69295f09..a1f458c660ce9f73bc9ac2ed194091ad0b8f8400 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -49,6 +49,8 @@ struct VarHandleBase {
 
   void AddOutput(OpHandleBase* out, ir::Node* node) {
     if (pending_ops_.find(out) == pending_ops_.end()) {
+      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
+                     this->Node()->Name());
       pending_ops_.insert(out);
       node_->outputs.push_back(node);
     }
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index a070b8efb86088532daf304cd80ff7b8c608ac42..b212666637a5289c9c6cd3585655deaeed8afd4b 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
   VLOG(5) << "destroy ExecutorPrepareContext";
 }
 
+template <typename RefCntMap>
+static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
+                                GarbageCollector<Tensor>* gc,
+                                RefCntMap* ref_cnts) {
+  std::unordered_set<Tensor*> erase_tensors;
+
+  auto handler = [&](const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        auto it = ref_cnts->find(name);
+        if (it == ref_cnts->end()) continue;
+        if ((it->second)-- == 1) {
+          auto* var = scope.FindVar(name);
+          if (var != nullptr) {
+            VLOG(10) << "Erase tensor \'" << name << "\'";
+            if (var->IsType<LoDTensor>()) {
+              erase_tensors.insert(var->GetMutable<LoDTensor>());
+            } else if (var->IsType<SelectedRows>()) {
+              erase_tensors.insert(
+                  var->GetMutable<SelectedRows>()->mutable_value());
+            }
+          }
+        }
+      }
+    }
+  };
+
+  handler(op->Inputs());
+  handler(op->Outputs());
+
+  if (!erase_tensors.empty()) {
+    gc->Add(erase_tensors);
+  }
+}
+
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 void Executor::Close() {
@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
-
   std::unique_ptr<GarbageCollector<Tensor>> gc;
-  if (max_memory_size >= 0) {
+  // WhileOp would set keep_kids to false
+  // WhileGradOp would need the scopes created in WhileOp
+  // Perhaps, we should not perform eager deletion in WhileOp
+  // The scopes and variables created by WhileOp would be deleted
+  // in WhileGradOp.
+  if (max_memory_size >= 0 && !keep_kids) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
     op->Run(*local_scope, place_);
 
     if (gc != nullptr) {
-      std::vector<std::string> erase_vars;
-      for (auto& input : op->Inputs()) {
-        for (auto& input_name : input.second) {
-          auto it = ctx->cur_ref_cnts_.find(input_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {  // should delete it
-            erase_vars.emplace_back(input_name);
-            ctx->cur_ref_cnts_.erase(input_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-
-      for (auto& output : op->Outputs()) {
-        for (auto& output_name : output.second) {
-          auto it = ctx->cur_ref_cnts_.find(output_name);
-          if (it == ctx->cur_ref_cnts_.end()) continue;
-          if (it->second == 1) {
-            erase_vars.emplace_back(output_name);
-            ctx->cur_ref_cnts_.erase(output_name);
-          } else {
-            --(it->second);
-          }
-        }
-      }
-
-      if (!erase_vars.empty()) {
-        std::vector<framework::LoDTensor*> erase_tensors;
-        for (auto& name : erase_vars) {
-          auto* var = local_scope->FindVar(name);
-          if (var == nullptr) continue;
-          if (var->IsType<framework::LoDTensor>()) {
-            auto* tensor = var->GetMutable<framework::LoDTensor>();
-            erase_tensors.push_back(tensor);
-          }
-        }
-        if (!erase_tensors.empty()) gc->Add(erase_tensors);
-      }
+      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
+                          &(ctx->cur_ref_cnts_));
     }
 
     if (FLAGS_benchmark) {
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index f0cc1338a8af50030a70a9797cbcd1b0567272b5..36b36d49c2728dbef93042158dffa26d8f56d529 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -32,38 +32,32 @@ template <typename T>
 std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
     const ProgramDesc& prog, size_t block_id) {
   auto& block = prog.Block(block_id);
-  std::unordered_set<std::string> ignored_vars;
   std::unordered_map<std::string, T> ref_cnts;
 
-  for (auto var_desc : block.AllVars()) {
-    auto type = var_desc->Proto()->type().type();
-    if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
-      ignored_vars.insert(var_desc->Name());  // ignore persistable vars
-    }
-  }
-
-  for (auto op_desc : block.AllOps()) {
-    for (auto& input : op_desc->Inputs()) {
-      for (auto& input_name : input.second) {
-        if (!ignored_vars.count(input_name)) {
-          if (ref_cnts.count(input_name))
-            ++ref_cnts[input_name];
-          else
-            ref_cnts[input_name] = 1;
+  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        auto* var_desc = block.FindVar(name);
+        if (var_desc == nullptr || var_desc->Persistable()) continue;
+        auto type = var_desc->Proto()->type().type();
+        if (type != proto::VarType::LOD_TENSOR &&
+            type != proto::VarType::SELECTED_ROWS) {
+          continue;
         }
-      }
-    }
 
-    for (auto& output : op_desc->Outputs()) {
-      for (auto output_name : output.second) {
-        if (!ignored_vars.count(output_name)) {
-          if (ref_cnts.count(output_name))
-            ++ref_cnts[output_name];
-          else
-            ref_cnts[output_name] = 1;
+        auto it = ref_cnts.find(name);
+        if (it != ref_cnts.end()) {
+          ++it->second;
+        } else {
+          ref_cnts[name] = 1;
         }
       }
     }
+  };
+
+  for (auto op_desc : block.AllOps()) {
+    update_ref_cnts(op_desc, op_desc->Inputs());
+    update_ref_cnts(op_desc, op_desc->Outputs());
   }
   return ref_cnts;
 }
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 25f0ba418433571343c5b2bbfdbf9fb940eaec52..efdabffb9b33ddf007c13008d0f3afb7a3961eda 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -35,6 +35,7 @@ enum AttrType {
   BLOCK = 8;
   LONG = 9;
   BLOCKS = 10;
+  LONGS = 11;
 }
 
 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -55,6 +56,7 @@ message OpDesc {
     optional int32 block_idx = 12;
     optional int64 l = 13;
     repeated int32 blocks_idx = 14;
+    repeated int64 longs = 15;
   };
 
   message Var {
@@ -80,7 +82,6 @@ message OpProto {
     optional bool duplicable = 3 [ default = false ];
     optional bool intermediate = 4 [ default = false ];
     optional bool dispensable = 5 [ default = false ];
-    optional string reuse = 6;
   }
 
   // AttrProto describes the C++ type Attribute.
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 796ce1f91ce6f3e21dc6f0af8fca4960d43f6e2b..28231a53bad50fe9f19cfe3e73c3dc09aa3762cf 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -10,7 +10,7 @@ function(pass_library TARGET DEST)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
     # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
     if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
         message(STATUS "add pass ${TARGET} ${DEST}")
@@ -25,20 +25,27 @@ cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
+cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
-if (WITH_MKLDNN)
-    pass_library(conv_relu_mkldnn_fuse_pass inference)
-endif ()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
 pass_library(embedding_fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
 pass_library(seq_concat_fc_fuse_pass inference)
+pass_library(multi_batch_merge_pass base)
 pass_library(conv_bn_fuse_pass inference)
+pass_library(seqconv_eltadd_relu_fuse_pass inference)
+if(WITH_MKLDNN)
+    pass_library(mkldnn_placement_pass base)
+    pass_library(depthwise_conv_mkldnn_pass base)
+    pass_library(conv_bias_mkldnn_fuse_pass inference)
+    pass_library(conv_relu_mkldnn_fuse_pass inference)
+    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
+endif()
 
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
 
@@ -53,5 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 if (WITH_MKLDNN)
+    cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
     cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index 1c75cb5a82029b6a542a3a2f031a353f5e40f4ea..6090f1fe76a49dddad0640123b1fa4db8c489634 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -262,7 +262,7 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
   std::unordered_set<std::string> specified_vars({"data_lod_attention",
                                                   "cell_init", "hidden_init",
                                                   "data", "week", "minute"});
-  int count = 0;
+  size_t count = 0;
   for (auto* node : graph->Nodes()) {
     if (node->IsVar() && specified_vars.count(node->Name())) {
       ++count;
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..449cc78be15bcd2575ce2e6846b41e475f8921f6
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
@@ -0,0 +1,137 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
+#include <functional>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+template <typename BinaryOperation>
+LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
+                               BinaryOperation f) {
+  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());
+  LoDTensor vec_y;
+  vec_y.Resize(vec_a.dims());
+  const float* a = vec_a.data<float>();
+  const float* b = vec_b.data<float>();
+  float* y = vec_y.mutable_data<float>(platform::CPUPlace());
+  for (int i = 0; i < vec_a.numel(); i++) {
+    y[i] = f(a[i], b[i]);
+  }
+  return vec_y;
+}
+
+std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+
+  auto* scope = param_scope();
+  PADDLE_ENFORCE(scope);
+
+  GraphPatternDetector gpd;
+  auto* conv_input =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
+          ->AsInput()
+          ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
+  conv_bias_pattern(conv_input);
+  int found_conv_bias_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvBias fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
+                              conv_bias_pattern);                      // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern);  // tmp
+    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern);  // CONV op
+    // bias
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern);
+    // output
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern);
+    // elementwise_add op
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern);
+
+    PADDLE_ENFORCE(subgraph.count(conv_input));
+
+    // check if fuse can be done and if MKL-DNN should be used
+    FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
+    if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
+      VLOG(3) << "do not perform conv+bias fuse";
+      return;
+    }
+
+    auto* eltwise_bias_tensor =
+        scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
+
+    auto input_names = conv->Op()->InputNames();
+    bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") !=
+                    input_names.end();
+    if (has_bias && conv->Op()->Input("Bias").size() > 0) {
+      auto conv_bias_names = conv->Op()->Input("Bias");
+      // add eltwise bias to existing conv bias
+      PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+      auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
+      auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
+      PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims());
+      *conv_bias_tensor = tensor_apply_eltwise(
+          *conv_bias_tensor, *eltwise_bias_tensor, std::plus<float>());
+
+      conv->Op()->SetOutput("Output",
+                            std::vector<std::string>({eltwise_out->Name()}));
+
+      GraphSafeRemoveNodes(graph.get(), {eltwise, conv_out});
+
+      IR_NODE_LINK_TO(conv, eltwise_out);
+    } else {
+      // take eltwise bias as conv bias
+      OpDesc desc;
+
+      desc.SetInput(
+          "Input", std::vector<std::string>({subgraph.at(conv_input)->Name()}));
+      desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
+      desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
+      desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
+      desc.SetType("conv2d");
+
+      for (auto& attr : conv->Op()->GetAttrMap()) {
+        desc.SetAttr(attr.first, attr.second);
+      }
+      auto conv_bias_node = g->CreateOpNode(&desc);
+
+      IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node);
+      IR_NODE_LINK_TO(conv_weight, conv_bias_node);
+      IR_NODE_LINK_TO(eltwise_bias, conv_bias_node);
+      IR_NODE_LINK_TO(conv_bias_node, eltwise_out);
+
+      GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out});
+    }
+
+    found_conv_bias_count++;
+  };
+  gpd(graph.get(), handler);
+  AddStatis(found_conv_bias_count);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvBiasFusePass);
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..5775b83b88730ec298c421a15f5c0b83c27b0750
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+* Fuse the Conv and Elementwise_add to a ConvBiasOp.
+*/
+class ConvBiasFusePass : public FusePassBase {
+ public:
+  virtual ~ConvBiasFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"conv_bias_mkldnn_fuse"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 04459612a726bcb60f9d752dfd8927b6f5c2500d..846a14e365e6bd7f056d409130a3b246371931da 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -126,12 +126,21 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
     // conv, batch_norm,
     // conv_weight, conv_out,
     // bn_scale, bn_bias, bn_mean, bn_variance,
-    // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance
+    // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,
+    // bn_saved_variance
     GET_CONV_BN_NODES(conv_bn_pattern);
 
+    // check if fuse can be done and if MKL-DNN should be used
+    FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
+    if (fuse_option == DO_NOT_FUSE) {
+      VLOG(3) << "do not perform conv+bn fuse";
+      return;
+    }
+
     // Create eltwise_y (conv bias) variable
     VarDesc eltwise_y_in_desc(
         patterns::PDNodeName(name_scope_, "eltwise_y_in"));
+    eltwise_y_in_desc.SetPersistable(true);
     auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
     auto* eltwise_y_in_tensor =
         scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
@@ -151,27 +160,59 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
                                *bn_mean, *bn_variance, eltwise_y_in_tensor,
                                epsilon);
 
-    // Create an elementwise add node
-    OpDesc desc;
-    desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
-    desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
-    desc.SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
-    desc.SetType("elementwise_add");
-    desc.SetAttr("axis", 1);
-    bool a = boost::get<bool>(conv->Op()->GetAttr("use_mkldnn"));
-    desc.SetAttr("use_mkldnn", a);
-    auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
-
-    GraphSafeRemoveNodes(graph.get(), {bn_scale, bn_bias, bn_mean, bn_variance,
-                                       batch_norm, bn_mean_out, bn_variance_out,
-                                       bn_saved_mean, bn_saved_variance});
-
-    PADDLE_ENFORCE(subgraph.count(conv_input));
-    IR_NODE_LINK_TO(conv_out, eltwise_op);
-    IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
-    IR_NODE_LINK_TO(eltwise_op, bn_out);
-
-    found_conv_bn_count++;
+    // with MKL-DNN fuse conv+bn into conv with bias
+    // without MKL-DNN fuse conv+bn into conv+elementwise_add
+    if (fuse_option == FUSE_MKLDNN) {
+      auto input_names = conv->Op()->InputNames();
+      bool has_bias = std::find(input_names.begin(), input_names.end(),
+                                "Bias") != input_names.end();
+      if (has_bias && conv->Op()->Input("Bias").size() > 0) {
+        // reuse existing conv bias node
+        auto conv_bias_names = conv->Op()->Input("Bias");
+        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+        auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
+        auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
+        PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
+                          eltwise_y_in_tensor->dims());
+
+        auto eigen_conv_bias = EigenVector<float>::From(*conv_bias_tensor);
+        eigen_conv_bias += EigenVector<float>::From(*eltwise_y_in_tensor);
+      } else {
+        // add new conv_bias node
+        conv->Op()->SetInput(
+            "Bias", std::vector<std::string>({eltwise_y_in_node->Name()}));
+        IR_NODE_LINK_TO(eltwise_y_in_node, conv);
+      }
+      conv->Op()->SetOutput("Output",
+                            std::vector<std::string>({bn_out->Name()}));
+
+      GraphSafeRemoveNodes(
+          graph.get(),
+          {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm,
+           bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance});
+
+      IR_NODE_LINK_TO(conv, bn_out);
+      found_conv_bn_count++;
+    } else {  // fuse_option == FUSE_NATIVE
+      // create an elementwise add node.
+      OpDesc desc;
+      desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
+      desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
+      desc.SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
+      desc.SetType("elementwise_add");
+      desc.SetAttr("axis", 1);
+      auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
+
+      GraphSafeRemoveNodes(
+          graph.get(),
+          {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
+           bn_variance_out, bn_saved_mean, bn_saved_variance});
+
+      IR_NODE_LINK_TO(conv_out, eltwise_op);
+      IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
+      IR_NODE_LINK_TO(eltwise_op, bn_out);
+      found_conv_bn_count++;
+    }
   };
 
   gpd(graph.get(), handler);
@@ -237,7 +278,6 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
         {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
          bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out});
 
-    PADDLE_ENFORCE(subgraph.count(conv_input));
     IR_NODE_LINK_TO(eltwise, bn_out);
 
     found_conv_bn_count++;
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d0035ae98b093979eb8bbcc0a8d6ae5356d951f
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include <functional>
+#include <utility>
+
+#include "paddle/fluid/framework/ir/graph_traits.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+
+// The function keeps the graph consistent by replacing
+// a node 'from' in the set of inputs nodes
+// of the visited node by a node 'to'.
+void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
+  for (auto& node : GraphTraits::DFS(*graph)) {
+    auto from_in_inputs =
+        std::find(std::begin(node.inputs), std::end(node.inputs), from);
+
+    if (from_in_inputs != std::end(node.inputs)) {
+      IR_NODE_LINK_TO(to, (&node));
+
+      auto inputs = node.Op()->Inputs();
+
+      using input_type = VariableNameMap::value_type;
+
+      std::for_each(std::begin(inputs), std::end(inputs),
+                    [from, to, &node](const input_type& i) -> void {
+                      auto param_names = i.second;
+                      auto pi = std::find(std::begin(param_names),
+                                          std::end(param_names), from->Name());
+
+                      if (pi != std::end(param_names)) {
+                        node.Op()->SetInput(i.first, {to->Name()});
+                      }
+                    });
+    }
+  }
+}
+}  // namespace
+using graph_ptr = std::unique_ptr<ir::Graph>;
+
+graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+
+  patterns::Conv conv_pattern{pattern, name_scope_};
+  auto conv_output = conv_pattern();
+
+  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_};
+  elementwise_add_pattern(conv_output);
+
+  conv_output->AsIntermediate();
+
+  auto conv_op_has_bias = [](const Node& conv_op) -> std::pair<bool, Node*> {
+    auto bias_input_names = conv_op.Op()->Inputs();
+    auto bias_it = bias_input_names.find("Bias");
+
+    if (bias_it != std::end(bias_input_names)) {
+      bool has_bias = !bias_it->second.empty();
+
+      if (has_bias) {
+        auto conv_bias_names = bias_it->second;
+        auto conv_bias_names_it =
+            std::find_if(std::begin(conv_op.inputs), std::end(conv_op.inputs),
+                         [&conv_bias_names](Node* n) -> bool {
+                           return n->Name() == conv_bias_names[0];
+                         });
+        return std::make_pair(has_bias, *conv_bias_names_it);
+      }
+    }
+
+    return std::make_pair(false, nullptr);
+  };
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                              elementwise_add_pattern);
+
+    if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+
+    OpDesc op_desc;
+    op_desc.SetType("conv2d");
+
+    op_desc.SetInput("Input", {conv_input->Name()});
+    op_desc.SetInput("Filter", {conv_filter->Name()});
+    op_desc.SetInput("ResidualData", {elementwise_add_x->Name()});
+    op_desc.SetOutput("Output", {conv_output->Name()});
+
+    bool has_bias;
+    Node* conv_bias;
+
+    std::tie(has_bias, conv_bias) = conv_op_has_bias(*conv_op);
+
+    if (has_bias) {
+      op_desc.SetInput("Bias", {conv_bias->Name()});
+    }
+
+    for (const auto& attr : conv_op->Op()->GetAttrMap()) {
+      op_desc.SetAttr(attr.first, attr.second);
+    }
+
+    op_desc.SetAttr("fuse_residual_connection", true);
+
+    auto fused_conv_op = g->CreateOpNode(&op_desc);
+
+    IR_NODE_LINK_TO(conv_input, fused_conv_op);
+    IR_NODE_LINK_TO(conv_filter, fused_conv_op);
+    IR_NODE_LINK_TO(elementwise_add_x, fused_conv_op);
+    IR_NODE_LINK_TO(fused_conv_op, conv_output);
+
+    if (has_bias) {
+      IR_NODE_LINK_TO(conv_bias, fused_conv_op);
+    }
+
+    CorrectGraphEdges(g, elementwise_add_out, conv_output);
+    GraphSafeRemoveNodes(g, {elementwise_add_out, conv_op, elementwise_add_op});
+  };
+
+  gpd(graph.get(), handler);
+
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAddMKLDNNFusePass);
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4a899f1adb5e993895a40a8cfb846a67b41bb22
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class ConvElementwiseAddMKLDNNFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAddMKLDNNFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"residual_connections_fuse_pass"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..348a3dfc5da78e860742595a60a0b7a8b2d92243
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,247 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <string>
+
+#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+namespace {
+constexpr int nodes_removed = 3;
+constexpr int nodes_added = 1;
+
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::pair<std::string, std::string>>& inputs,
+           const std::pair<std::string, std::string>& output) {
+  auto op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", true);
+
+  for (const auto& input : inputs) {
+    op->SetInput(input.first, {input.second});
+  }
+
+  op->SetOutput(output.first, {output.second});
+}
+
+struct IsReachable {
+  using func = std::function<bool(const std::string&, const std::string&)>;
+
+  auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
+    auto find_node = [](const std::unique_ptr<ir::Graph>& graph,
+                        const std::string& name) -> Node* {
+      for (auto& node : GraphTraits::DFS(*graph)) {
+        if (name == node.Name()) {
+          return &node;
+        }
+      }
+
+      return nullptr;
+    };
+
+    return [&](std::string from, const std::string to) -> bool {
+      if (from == to) return true;
+
+      std::map<std::string, bool> visited;
+
+      for (auto& node : GraphTraits::DFS(*graph)) {
+        visited[node.Name()] = false;
+      }
+
+      visited[from] = true;
+
+      std::list<std::string> queue;
+      queue.push_back(from);
+
+      while (!queue.empty()) {
+        auto cur = find_node(graph, queue.front());
+        queue.pop_front();
+
+        if (cur == nullptr) return false;
+
+        for (auto n : cur->outputs) {
+          if (n->Name() == to) return true;
+
+          if (!visited[n->Name()]) {
+            visited[n->Name()] = true;
+            queue.push_back(n->Name());
+          }
+        }
+      }
+      return false;
+    };
+  }
+};
+
+void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
+  int conv_count = 0;
+  int elementwise_add_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      ++conv_count;
+    }
+    if (node->IsOp() && node->Op()->Type() == "elementwise_add") {
+      ++elementwise_add_count;
+    }
+  }
+  EXPECT_EQ(conv_count, 1);
+  EXPECT_EQ(elementwise_add_count, 0);
+}
+
+ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
+                             const std::vector<std::string>& persistent_vars) {
+  ProgramDesc prog;
+
+  auto add_var_to_prog = [&prog](const std::string& var_name) -> VarDesc* {
+    auto var = prog.MutableBlock(0)->Var(var_name);
+    var->SetType(proto::VarType::LOD_TENSOR);
+
+    return var;
+  };
+
+  for (const auto& v : transient_vars) {
+    add_var_to_prog(v);
+  }
+
+  for (const auto& v : persistent_vars) {
+    auto var = add_var_to_prog(v);
+    var->SetPersistable(true);
+  }
+
+  return prog;
+}
+}  // namespace
+
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
+  auto prog =
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+
+  SetOp(&prog, "conv2d",
+        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+
+  AssertOpsCount(graph);
+}
+
+TEST(ConvElementwiseAddMKLDNNFusePass,
+     ConvolutionWithElementwiseAddReluNoBias) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
+  SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  IsReachable is_reachable;
+
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+
+  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+
+  AssertOpsCount(graph);
+}
+
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "b"});
+  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  IsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "d"));
+
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+
+  EXPECT_FALSE(is_reachable(graph)("a", "d"));
+
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+
+TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
+  auto prog =
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
+  SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  IsReachable is_reachable;
+
+  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+
+  auto pass =
+      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+
+  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+
+  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
+            current_nodes_num);
+  AssertOpsCount(graph);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(conv_elementwise_add_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
index d7df6389cfd595324e284e0da10f65213ccee80f..e359a3832ee8d549f8c58d63bc1cc6564ecadede 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -46,6 +46,12 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
     GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
     GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op
 
+    FuseOptions fuse_option = FindFuseOption(*conv, *relu);
+    if (fuse_option == DO_NOT_FUSE) {
+      VLOG(3) << "do not perform conv+relu fuse";
+      return;
+    }
+
     // Transform Conv node into ConvReLU node.
     OpDesc* desc = conv->Op();
     desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
index b5de0d548713772e7ad41cfb6d8b3e9460683efb..fe585bd7c41bb32ae00462e989ab4c0051fc89a8 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase {
   virtual ~ConvReLUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
index 9dd780ec89ab991d6d99cb66fa2a9b683be2b9ca..19248b4dfee1da81d18cd2effac08ba68dde80fb 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -15,25 +15,30 @@
 #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
 
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-void SetOp(ProgramDesc* prog, const std::string& type,
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
            const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs) {
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
   auto* op = prog->MutableBlock(0)->AppendOp();
   op->SetType(type);
   if (type == "conv2d") {
-    op->SetAttr("use_mkldnn", true);
+    op->SetAttr("use_mkldnn", use_mkldnn);
+    op->SetAttr("name", name);
     op->SetInput("Input", {inputs[0]});
     op->SetInput("Filter", {inputs[1]});
     op->SetInput("Bias", {inputs[2]});
   } else if (type == "relu") {
+    op->SetAttr("use_mkldnn", use_mkldnn);
     op->SetInput("X", inputs);
   }
   op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }
 
 // a->OP0->b
@@ -43,7 +48,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
 ProgramDesc BuildProgramDesc() {
   ProgramDesc prog;
   for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g"})) {
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
+                                 "h", "weights2", "bias2", "k", "l"})) {
     auto* var = prog.MutableBlock(0)->Var(v);
     var->SetType(proto::VarType::SELECTED_ROWS);
     if (v == "weights" || v == "bias") {
@@ -51,14 +57,24 @@ ProgramDesc BuildProgramDesc() {
     }
   }
 
-  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+  SetOp(&prog, "OP0", "op0", std::vector<std::string>({"a"}),
         std::vector<std::string>({"b"}));
-  SetOp(&prog, "OP1", std::vector<std::string>({"b"}),
+  SetOp(&prog, "OP1", "op1", std::vector<std::string>({"b"}),
         std::vector<std::string>({"c"}));
-  SetOp(&prog, "conv2d", std::vector<std::string>({"c", "weights", "bias"}),
-        std::vector<std::string>({"f"}));
-  SetOp(&prog, "relu", std::vector<std::string>({"f"}),
-        std::vector<std::string>({"g"}));
+  // conv+relu, both with MKL-DNN
+  SetOp(&prog, "conv2d", "conv1",
+        std::vector<std::string>({"c", "weights", "bias"}),
+        std::vector<std::string>({"f"}), true);
+  SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"g"}), true);
+  SetOp(&prog, "OP3", "op3", std::vector<std::string>({"g"}),
+        std::vector<std::string>({"h"}));
+  // conv+relu, only one with MKL-DNN
+  SetOp(&prog, "conv2d", "conv2",
+        std::vector<std::string>({"h", "weights2", "bias2"}),
+        std::vector<std::string>({"k"}), true);
+  SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
+        std::vector<std::string>({"l"}));
 
   return prog;
 }
@@ -88,10 +104,16 @@ TEST(ConvReLUFusePass, basic) {
       auto* op = node->Op();
       ASSERT_TRUE(op->HasAttr("use_mkldnn"));
       EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
-      ASSERT_TRUE(op->HasAttr("fuse_relu"));
-      bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
-      if (fuse_relu) {
-        ++conv_relu_count;
+      // check if only "conv1" convolution is fused
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
+      if (op_name == "conv1") {
+        ASSERT_TRUE(op->HasAttr("fuse_relu"));
+        bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
+        if (fuse_relu) {
+          ++conv_relu_count;
+        }
+      } else if (op_name == "conv2") {
+        ASSERT_FALSE(op->HasAttr("fuse_relu"));
       }
     }
   }
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..19056e18aa892dbc83dfbf7305b6ad8b6b6bc51c
--- /dev/null
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_NODE(id, pattern)                               \
+  PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
+                 "pattern has no Node called %s", #id);     \
+  auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
+  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+
+std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get());
+  GraphPatternDetector gpd;
+
+  auto* pattern = gpd.mutable_pattern();
+  pattern->NewNode("depthwise_conv")
+      ->assert_is_op("depthwise_conv2d")
+      ->assert_op_attr("use_mkldnn", true);
+
+  int found_depthwise_conv_mkldnn_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
+    GET_NODE(depthwise_conv, (*pattern));
+    depthwise_conv->Op()->SetType("conv2d");
+    found_depthwise_conv_mkldnn_count++;
+  };
+
+  gpd(graph.get(), handler);
+  AddStatis(found_depthwise_conv_mkldnn_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(depthwise_conv_mkldnn_pass,
+              paddle::framework::ir::DepthwiseConvMKLDNNPass);
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ca6a7325186401c26eb7f9375cf83b7b97cc1c9
--- /dev/null
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class DepthwiseConvMKLDNNPass : public FusePassBase {
+ public:
+  virtual ~DepthwiseConvMKLDNNPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..09d0b15f46a7e50afb6aea46383013ce6a6c6118
--- /dev/null
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("name", name);
+  op->SetInput("Input", {inputs[0]});
+  op->SetInput("Filter", {inputs[1]});
+  op->SetInput("Bias", {inputs[2]});
+  op->SetOutput("Out", outputs);
+}
+
+// (a, weights, bias)->depthwise conv mkldnn->b
+// (b, weights2, bias2)->depthwise conv no mkldnn->c
+// (c, weights3, bias3)->conv mkldnn->d
+// (d, weights3, bias3)->conv no mkldnn->e
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v : std::vector<std::string>(
+           {"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2",
+            "weights3", "bias3", "weights4", "bias4"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" ||
+        v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") {
+      var->SetPersistable(true);
+    }
+  }
+
+  // depthwise conv with MKL-DNN
+  SetOp(&prog, "depthwise_conv2d", "conv1",
+        std::vector<std::string>({"a", "weights", "bias"}),
+        std::vector<std::string>({"b"}), true);
+  // depthwise conv without MKL-DNN
+  SetOp(&prog, "depthwise_conv2d", "conv2",
+        std::vector<std::string>({"b", "weights2", "bias2"}),
+        std::vector<std::string>({"c"}), false);
+  // conv with MKL-DNN
+  SetOp(&prog, "conv2d", "conv3",
+        std::vector<std::string>({"c", "weights3", "bias3"}),
+        std::vector<std::string>({"d"}), true);
+  // conv without MKL-dNN
+  SetOp(&prog, "conv2d", "conv4",
+        std::vector<std::string>({"d", "weights4", "bias4"}),
+        std::vector<std::string>({"e"}), false);
+
+  return prog;
+}
+
+TEST(DepthwiseConvMKLDNNPass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
+
+  struct counters {
+    int mkldnn_depthwise_conv_nodes;
+    int other_depthwise_conv_nodes;
+    int mkldnn_conv_nodes;
+    int other_conv_nodes;
+  };
+
+  counters before{1, 1, 1, 1};
+
+  graph = pass->Apply(std::move(graph));
+
+  // initialize counters before loop
+  counters after{0, 0, 0, 0};
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "conv2d") {
+        if (boost::get<bool>(op->GetAttr("use_mkldnn")))
+          after.mkldnn_conv_nodes++;
+        else
+          after.other_conv_nodes++;
+      } else if (op->Type() == "depthwise_conv2d") {
+        if (boost::get<bool>(op->GetAttr("use_mkldnn")))
+          after.mkldnn_depthwise_conv_nodes++;
+        else
+          after.other_depthwise_conv_nodes++;
+      }
+    }
+  }
+
+  EXPECT_EQ(after.other_depthwise_conv_nodes,
+            before.other_depthwise_conv_nodes);
+  EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
+  EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
+            before.mkldnn_depthwise_conv_nodes - 1);
+  EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(depthwise_conv_mkldnn_pass);
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
index 06286a109d01af638e74e06ccc83e2a5500663ea..2db7d95cae1c8c59691fd642e2462e92ed58814f 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
 namespace framework {
@@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
     op->SetInput("X", inputs);
   }
   op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }
 
 // a->OP0->b
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d70010089e4b4fbb4542ef7748b8e9ece48d3942
--- /dev/null
+++ b/paddle/fluid/framework/ir/fuse_pass_base.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void FusePassBase::Init(const std::string& repr, Graph* graph) const {
+  repr_ = repr;
+  graph_ = graph;
+}
+
+Scope* FusePassBase::param_scope() const {
+  PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
+  return graph_->Get<framework::Scope*>(kParamScopeAttr);
+}
+
+void FusePassBase::AddStatis(int count_of_fused) const {
+  PADDLE_ENFORCE(graph_);
+  PADDLE_ENFORCE(!repr_.empty());
+  if (!graph_->Has(kFuseStatisAttr)) {
+    graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
+  }
+  auto& info =
+      graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
+  info[repr_] = count_of_fused;
+}
+
+FuseOptions FusePassBase::FindFuseOption(const Node& node1,
+                                         const Node& node2) const {
+#ifdef PADDLE_WITH_MKLDNN
+  bool node1_mkldnn = node1.Op()->HasAttr("use_mkldnn") &&
+                      boost::get<bool>(node1.Op()->GetAttr("use_mkldnn"));
+  bool node2_mkldnn = node2.Op()->HasAttr("use_mkldnn") &&
+                      boost::get<bool>(node2.Op()->GetAttr("use_mkldnn"));
+  if (node1_mkldnn && node2_mkldnn)
+    return FUSE_MKLDNN;
+  else if (!node1_mkldnn && !node2_mkldnn)
+    return FUSE_NATIVE;
+  else
+    return DO_NOT_FUSE;
+#else
+  return FUSE_NATIVE;
+#endif
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h
index 877bbeb502252cac77095981641d7ce283ca1eb7..c53b2a6186741d86f14faf1d21fa19aa09cec036 100644
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
@@ -25,32 +25,24 @@ namespace ir {
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
 
+enum FuseOptions {
+  DO_NOT_FUSE,  // fusing will not be done
+  FUSE_NATIVE,  // fusing will be done without MKL-DNN
+  FUSE_MKLDNN   // fusing will be done with MKL-DNN
+};
+
 class FusePassBase : public Pass {
  public:
-  void Init(const std::string& repr, Graph* graph) const {
-    repr_ = repr;
-    graph_ = graph;
-  }
-
-  Scope* param_scope() const {
-    PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
-    return graph_->Get<framework::Scope*>(kParamScopeAttr);
-  }
-
-  void AddStatis(int count_of_fused) const {
-    PADDLE_ENFORCE(graph_);
-    PADDLE_ENFORCE(!repr_.empty());
-    if (!graph_->Has(kFuseStatisAttr)) {
-      graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
-    }
-    auto& info =
-        graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
-    info[repr_] = count_of_fused;
-  }
+  void Init(const std::string& repr, Graph* graph) const;
+  Scope* param_scope() const;
+  void AddStatis(int count_of_fused) const;
 
   virtual ~FusePassBase() {}
 
  protected:
+  virtual FuseOptions FindFuseOption(const Node& node1,
+                                     const Node& node2) const;
+
   mutable Graph* graph_;
   mutable std::string repr_;
 };
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 398f7095968e62f92d610f560d7574b27706d13e..813f620d7ceb6f341e5fa6e96fb5ca0face96f51 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -23,80 +23,78 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
-
-std::vector<std::string> FindDistTrainSendVars(
-    const std::vector<ir::Node *> &nodes) {
-  std::vector<std::string> send_vars;
-  // since parameters are all in block 0,
-  // it's enough to only scan send ops in block 0
-  for (auto &node : nodes) {
-    auto op_vars = node->Op()->InputArgumentNames();
-    send_vars.reserve(send_vars.size() +
-                      std::distance(op_vars.begin(), op_vars.end()));
-    send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
-  }
-  return send_vars;
-}
-
-std::vector<std::string> FindDistTrainRecvVars(
-    const std::vector<ir::Node *> &nodes) {
-  std::vector<std::string> recv_vars;
-  for (auto &node : nodes) {
-    auto op_vars = node->Op()->OutputArgumentNames();
-    recv_vars.reserve(recv_vars.size() +
-                      std::distance(op_vars.begin(), op_vars.end()));
-    recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
-  }
-  return recv_vars;
-}
-
-bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
-                   const std::vector<std::string> &recv_vars) {
-  if (send_vars.size() == 0 || recv_vars.size() == 0) {
-    return false;
-  }
-
-  /**
-   * Check any of opvars contains `.block` and in sendvars
-   */
-  auto checker = [](const std::vector<std::string> &opvars,
-                    const std::vector<std::string> &rpc_vars) -> bool {
-    for (auto &var : opvars) {
-      // a variable name with the suffix `.block` means it's a splited
-      // variable by (DistributeTranspiler)
-      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
-      if (var.find(".block") != std::string::npos &&
-          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
-        return true;
+namespace {
+
+void CheckProgram(const ProgramDesc &program) {
+  std::map<int, bool> visit;
+#define _INT(role) static_cast<int>(role)
+
+  for (size_t i = 0; i < program.Size(); ++i) {
+    for (OpDesc *op : program.Block(i).AllOps()) {
+      // For backward compatibility, some program doesn't have role added.
+      if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
+      int role_id = boost::get<int>(
+          op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+      visit[role_id] = true;
+      switch (role_id) {
+        case _INT(OpRole::kForward):
+          PADDLE_ENFORCE(
+              visit.find(_INT(OpRole::kBackward)) == visit.end(),
+              "Cannot add forward operator before backward operator.");
+          break;
+        case _INT(OpRole::kBackward):
+        case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
+          PADDLE_ENFORCE(
+              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+              "Cannot add backward operator before optimize operator.");
+          break;
+        case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
+          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
+                                    _INT(OpRole::kLoss)) == visit.end(),
+                         "Cannot add backward|loss operator before "
+                         "forward|loss operator.");
+          PADDLE_ENFORCE(
+              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+              "Cannot add backward operator before optimize operator.");
+          break;
+        case _INT(OpRole::kOptimize):
+        case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
+          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
+                         "Optimize operators must follow backward operator.");
+          break;
+        case _INT(OpRole::kLRSched):
+        case _INT(OpRole::kDist):
+        case _INT(OpRole::kRPC):
+        case _INT(OpRole::kNotSpecified):
+          break;
+        default:
+          LOG(FATAL) << "Unknown operator role. Don't add new role because "
+                        "you don't know what you are doing.";
       }
     }
-    return false;
-  };
-
-  std::vector<std::string> input_var_names;
-  std::vector<std::string> output_var_names;
-  for (ir::Node *input : node->inputs) {
-    input_var_names.push_back(input->Name());
   }
-  for (ir::Node *output : node->outputs) {
-    output_var_names.push_back(output->Name());
-  }
-
-  return checker(output_var_names, send_vars) ||
-         checker(input_var_names, recv_vars);
+#undef _INT
 }
+}  // namespace
 
 Graph::Graph(const ProgramDesc &program) : program_(program) {
+  CheckProgram(program_);
   // Make the nodes id start from 0.
   Node::ResetId();
+  auto var_nodes = InitFromProgram(program_);
+  ResolveHazard(var_nodes);
+}
 
+std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
+    const ProgramDesc &program) {
   VLOG(3) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
+  // var nodes for each var name, will have multiple versions in SSA
+  std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *var : program.Block(0).AllVars()) {
     all_vars.emplace(var->Name(), var);
   }
 
-  std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *op : program.Block(0).AllOps()) {
     ir::Node *node = CreateOpNode(op);
     // For input args, reuse the same var name if it was created before.
@@ -134,7 +132,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
       var->inputs.push_back(node);
     }
   }
+  return std::move(var_nodes);
+}
 
+void Graph::ResolveHazard(
+    const std::map<std::string, std::vector<ir::Node *>> &var_nodes) {
   /**
    * We should handle write after read(WAR) and write after write(WAW) here.
    * Because some of the operators of the program can be executed parallelly.
@@ -153,6 +155,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     auto it_old = versions.rbegin();
     ++it_old;
     for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
+      VLOG(3) << "deal with var: " << (*it_new)->Name();
       ir::Node *write_op =
           (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
       const auto &read_ops = (*it_old)->outputs;
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index ab687e760a761d4e445726bd5149966adc2403d0..9d7aa5d32deb274fbf29481b0d4754c05d1e21b5 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -160,6 +160,12 @@ class Graph {
     return nullptr;
   }
 
+  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
+      const ProgramDesc &program);
+
+  void ResolveHazard(
+      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
+
  private:
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index c54766d95a61ac1a4b61566c6de62cbc86685a1d..01e878089171e4620f32b57a65d92d1c86d307db 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) {
   std::deque<ir::Node *> q_nodes;
   std::vector<std::unordered_set<ir::Node *>> graph_nodes;
   std::unordered_set<ir::Node *> g_nodes;
+  // q_set used to record records in the queue.
+  std::unordered_set<ir::Node *> q_set;
   size_t graph_count = 0;
 
-  auto traverse_nodes = [&visited_nodes,
-                         &q_nodes](const std::vector<ir::Node *> &nodes) {
-    std::copy_if(
-        nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
-        [&visited_nodes](Node *node) { return !visited_nodes.count(node); });
+  auto traverse_nodes = [&visited_nodes, &q_nodes,
+                         &q_set](const std::vector<ir::Node *> &nodes) {
+    for (auto n : nodes) {
+      if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) {
+        q_nodes.push_back(n);
+        q_set.insert(n);
+      }
+    }
   };
 
   while (visited_nodes.size() != nodes.size()) {
     if (!q_nodes.empty()) {
       auto cur_node = q_nodes.front();
       q_nodes.pop_front();
+      q_set.erase(cur_node);
       visited_nodes.insert(cur_node);
       g_nodes.insert(cur_node);
       traverse_nodes(cur_node->inputs);
@@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) {
       for (auto &n : nodes) {
         if (visited_nodes.count(n) == 0) {
           q_nodes.push_back(n);
+          q_set.insert(n);
           break;
         }
       }
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
index cea902809339f9d45b0e2525163f08a3c1c44c95..260a73ae763bd2cdea9948e4d928377a7c718dda 100644
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -200,15 +200,15 @@ TEST(GraphHelperTest, GraphNum) {
 
   Graph g(prog);
   BuildZeroGraph(&g);
-  ASSERT_EQ(GraphNum(g), 0);
+  ASSERT_EQ(GraphNum(g), 0UL);
 
   Graph g2(prog);
   BuildOneGraph(&g2);
-  ASSERT_EQ(GraphNum(g2), 1);
+  ASSERT_EQ(GraphNum(g2), 1UL);
 
   Graph g3(prog);
   BuildTwoGraphs(&g3);
-  ASSERT_EQ(GraphNum(g3), 2);
+  ASSERT_EQ(GraphNum(g3), 2UL);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 8625b562e7dfab5a65692863cdc22b62ce15d758..b20d70132256bd5df7411c46ff4eb246b1f14ba8 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -259,18 +259,33 @@ GraphPatternDetector::DetectPatterns() {
   return result;
 }
 
+bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
+                  const std::pair<PDNode *, Node *> &b) {
+  if (a.first != b.first) {
+    return a.first < b.first;
+  } else {
+    return a.second < b.second;
+  }
+}
+
+// TODO(Superjomn) enhance the function as it marks unique unique as duplicates
+// see https://github.com/PaddlePaddle/Paddle/issues/13550
 void GraphPatternDetector::UniquePatterns(
     std::vector<GraphPatternDetector::subgraph_t> *subgraphs) {
   if (subgraphs->empty()) return;
   std::vector<GraphPatternDetector::subgraph_t> result;
 
   std::unordered_set<size_t> set;
+  std::hash<std::string> hasher;
   for (auto &g : *subgraphs) {
-    size_t key = 0;
-    for (auto &item : g) {
-      key ^= std::hash<void *>{}(item.first);
-      key ^= std::hash<void *>{}(item.second);
+    // Sort the items in the sub-graph, and transform to a string key.
+    std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
+    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
+    std::stringstream ss;
+    for (auto &item : sorted_keys) {
+      ss << item.first << ":" << item.second;
     }
+    auto key = hasher(ss.str());
     if (!set.count(key)) {
       result.emplace_back(g);
       set.insert(key);
@@ -759,6 +774,51 @@ PDNode *patterns::ConvReLU::operator()(
   return relu_out_var;
 }
 
+PDNode *patterns::SeqConvEltAddRelu::operator()(
+    paddle::framework::ir::PDNode *seqconv_input) {
+  // Create Operators
+  seqconv_input->assert_is_op_input("sequence_conv", "X");
+  auto *seqconv_op = pattern->NewNode(seqconv_repr())
+                         ->assert_is_op("sequence_conv")
+                         ->assert_op_attr<bool>("paddingTrainable", false)
+                         ->assert_op_attr<int>("contextStride", 1);
+
+  auto *eltadd_op =
+      pattern->NewNode(eltadd_repr())->assert_is_op("elementwise_add");
+  auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
+  // Create variables
+  // Filter
+  auto *seqconv_weight_var =
+      pattern->NewNode(seqconv_weight_repr())
+          ->AsInput()
+          ->assert_is_persistable_var()
+          ->assert_is_op_input("sequence_conv", "Filter");
+  // Bias
+  auto *eltadd_bias_var = pattern->NewNode(eltadd_bias_repr())
+                              ->AsInput()
+                              ->assert_is_op_input("elementwise_add");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto *seqconv_out_var = pattern->NewNode(seqconv_out_repr())
+                              ->AsIntermediate()
+                              ->assert_is_only_output_of_op("sequence_conv")
+                              ->assert_is_op_input("elementwise_add");
+  auto *eltadd_out_var = pattern->NewNode(eltadd_out_repr())
+                             ->AsIntermediate()
+                             ->assert_is_only_output_of_op("elementwise_add")
+                             ->assert_is_only_input_of_op("relu");
+  // output
+  auto *relu_out_var = pattern->NewNode(relu_out_repr())
+                           ->AsOutput()
+                           ->assert_is_op_output("relu");
+
+  seqconv_op->LinksFrom({seqconv_input, seqconv_weight_var})
+      .LinksTo({seqconv_out_var});
+  eltadd_op->LinksFrom({seqconv_out_var, eltadd_bias_var})
+      .LinksTo({eltadd_out_var});
+  relu_op->LinksFrom({eltadd_out_var}).LinksTo({relu_out_var});
+  return relu_out_var;
+}
+
 PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
                                  bool with_bias) {
   // Create shared nodes.
@@ -964,6 +1024,79 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
   return ele_add_grad;
 }
 
+PDNode *patterns::ConvBias::operator()(
+    paddle::framework::ir::PDNode *conv_input) {
+  // Create Operators
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  auto *eltiwse_op =
+      pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
+  // Create variables
+  // Filter
+  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("conv2d", "Filter");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto *conv_out_var = pattern->NewNode(conv_out_repr())
+                           ->AsIntermediate()
+                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_op_input("elementwise_add");
+  // Bias stored in elementwise_add
+  auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
+                               ->AsInput()
+                               ->assert_is_persistable_var()
+                               ->assert_is_op_input("elementwise_add", "Y");
+  // output
+  auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr())
+                              ->AsOutput()
+                              ->assert_is_op_output("elementwise_add");
+  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
+  eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var})
+      .LinksTo({eltwise_out_var});
+  return eltwise_out_var;
+}
+
+PDNode *patterns::Conv::operator()() {
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+
+  auto input_var = pattern->NewNode(conv_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("conv2d", "Input");
+
+  auto filter_var = pattern->NewNode(conv_filter_repr())
+                        ->AsInput()
+                        ->assert_is_op_input("conv2d", "Filter");
+
+  auto output_var = pattern->NewNode(conv_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("conv2d", "Output");
+
+  conv_op->LinksFrom({input_var, filter_var});
+  conv_op->LinksTo({output_var});
+
+  return output_var;
+}
+
+PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) {
+  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                ->assert_is_op("elementwise_add");
+
+  x_var->assert_is_op_input("elementwise_add", "X");
+
+  auto y_var = pattern->NewNode(elementwise_add_x_repr())
+                   ->AsInput()
+                   ->assert_is_op_input("elementwise_add", "Y");
+
+  auto out_var = pattern->NewNode(elementwise_add_out_repr())
+                     ->AsOutput()
+                     ->assert_is_op_output("elementwise_add", "Out");
+
+  elementwise_add_op->LinksFrom({x_var, y_var});
+  elementwise_add_op->LinksTo({out_var});
+
+  return out_var;
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index cdd6413d968b065453177ff78b0aad641a09f6e7..9e462ac671ee931fc17a31f32a76049a0990341f 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -128,6 +128,15 @@ struct PDNode {
       const std::unordered_set<std::string>& op_types,
       const std::string& argument, int nth);
 
+  template <typename T>
+  PDNode* assert_op_attr(const std::string& attr_name, const T& attr) {
+    asserts_.emplace_back([=](Node* x) {
+      return x && x->IsOp() && x->Op()->HasAttr(attr_name) &&
+             boost::get<T>(x->Op()->GetAttr(attr_name)) == attr;
+    });
+    return this;
+  }
+
  private:
   PDNode(PDPattern* pattern, const std::string& name = "",
          Type type = Type::kVar)
@@ -434,6 +443,31 @@ struct ConvReLU : public PatternBase {
   PATTERN_DECL_NODE(relu_out);
 };
 
+// SEQCONV with Elementwise_Add ReLU
+// op: seqconv + elementwise_add + relu
+// named nodes:
+// seqconv_input, seqconv_weight,
+// seqconv_out, seqconv,
+// elementwise_add_bias, elementwise_add_out, elementwise_add
+// relu_out, relu
+struct SeqConvEltAddRelu : public PatternBase {
+  SeqConvEltAddRelu(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "seqconv_eltadd_relu") {}
+
+  PDNode* operator()(PDNode* seqconv_input);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(seqconv);
+  PATTERN_DECL_NODE(eltadd);
+  PATTERN_DECL_NODE(relu);
+  // declare variable node's name
+  PATTERN_DECL_NODE(seqconv_weight);
+  PATTERN_DECL_NODE(seqconv_out);
+  PATTERN_DECL_NODE(eltadd_bias);
+  PATTERN_DECL_NODE(eltadd_out);
+  PATTERN_DECL_NODE(relu_out);
+};
+
 // FC with bias
 // op: mul + elementwise_add
 // named nodes:
@@ -578,6 +612,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
   PATTERN_DECL_NODE(d_ele_y);
   PATTERN_DECL_NODE(ele_y);
 };
+
+// Conv with Elementwise_add as bias
+// op: conv + elementwise_add
+// named nodes:
+// conv_input, conv_weight,
+// conv_out, conv,
+// eltwise_bias, eltwise_out,
+// elementwise_add
+struct ConvBias : public PatternBase {
+  ConvBias(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_bias") {}
+  PDNode* operator()(PDNode* conv_input);
+  // declare operator node's name
+  PATTERN_DECL_NODE(conv);
+  PATTERN_DECL_NODE(eltwise);
+  // declare variable node's name
+  PATTERN_DECL_NODE(conv_weight);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(eltwise_bias);
+  PATTERN_DECL_NODE(eltwise_out);
+};
+
+// Convolution op
+// Forward pass for convolution.
+// conv_input, conv_bias and conv_filter are inputs.
+// conv_output is a result of the operator.
+// residual_data is data used by skip connection.
+// If residual connection fusion is on, the formula is:
+// conv_output = conv_op(conv_filter, conv_input, conv_bias)
+//             + conv_residual_data
+// If the fusion is off, conv_residual_data is not added.
+struct Conv : public PatternBase {
+  Conv(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "convolution") {}
+
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_input);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_residual_data);
+  PATTERN_DECL_NODE(conv_output);
+};
+
+// ElementwiseAdd used in residual connections.
+// y_var is used and convolution output.
+// The operator is removed, when residual
+// connection fusion is on.
+struct ElementwiseAdd : public PatternBase {
+  ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elementwise_add") {}
+
+  PDNode* operator()(PDNode* x_var);
+
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_x);
+  PATTERN_DECL_NODE(elementwise_add_y);
+  PATTERN_DECL_NODE(elementwise_add_out);
+};
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index cadda49c399a6d65079cacedfea61f4fd580a69a..7ed2f96eb24239d87965192d73f4ba200ff5dbeb 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -124,7 +124,7 @@ TEST(GraphTest, Basic) {
       ASSERT_EQ(n->outputs.size(), 0UL);
     }
   }
-  ASSERT_EQ(nodes.size(), 5);
+  ASSERT_EQ(nodes.size(), 5UL);
 }
 
 TEST(GraphTest, WriteAfterRead) {
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..65be69b7f5b5e363d5d0753c45f9ff9e3f329fbe
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
+      n->Op()->SetAttr("use_mkldnn", true);
+    }
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(mkldnn_placement_pass,
+              paddle::framework::ir::MKLDNNPlacementPass);
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn_placement_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d4dc9e2b6ecccddea4d63e45710c80d55ef2772
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class MKLDNNPlacementPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd5b76426eb55cebdabfccd700439a4c418a10f0
--- /dev/null
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -0,0 +1,315 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/multi_batch_merge_pass.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+static const char kNumRepeats[] = "num_repeats";
+typedef std::unordered_map<std::string, std::vector<ir::Node*>> SSAVarList;
+
+ir::Node* SameNameVar(std::unordered_set<ir::Node*> all, ir::Node* target) {
+  for (auto n : all) {
+    if (target->IsVar() && target->Name() == n->Name()) {
+      return n;
+    }
+  }
+  return nullptr;
+}
+
+VarDesc CopyVarDesc(VarDesc* var_desc) {
+  VarDesc repeated_var(var_desc->Name());
+  // copy other variable attributes
+  if (var_desc->GetType() != proto::VarType::READER) {
+    repeated_var.SetType(var_desc->GetType());
+    repeated_var.SetShape(var_desc->GetShape());
+    repeated_var.SetDataType(var_desc->GetDataType());
+    repeated_var.SetLoDLevel(var_desc->GetLoDLevel());
+    repeated_var.SetPersistable(var_desc->Persistable());
+  } else {
+    // TODO(typhoonzero): copy reader var
+  }
+  return repeated_var;
+}
+
+VarDesc UpdateGradVarDesc(
+    VarDesc* var_desc, int repeat,
+    const std::unordered_set<std::string>& grad_names,
+    const std::unordered_set<std::string>& bn_vars_need_rename) {
+  if (grad_names.find(var_desc->Name()) != grad_names.end() ||
+      bn_vars_need_rename.find(var_desc->Name()) != bn_vars_need_rename.end()) {
+    std::string new_gname =
+        string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
+    VarDesc repeated_var = CopyVarDesc(var_desc);
+    repeated_var.SetName(new_gname);
+    VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
+    return repeated_var;
+  }
+  return *var_desc;
+}
+
+std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
+    std::unique_ptr<Graph> graph) const {
+  int num_repeats = Get<const int>(kNumRepeats);
+  std::vector<Node*> forward_backward_ops;
+  std::vector<Node*> optimize_ops;
+  std::vector<Node*> lr_ops;  // ops other than forward/backward/optimize
+  std::unordered_set<std::string> grad_names;
+
+  std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
+  auto origin_nodes = graph->ReleaseNodes();
+  VLOG(3) << "origin nodes count: " << origin_nodes.size();
+  ir::Graph& result = *graph;
+
+  // 1. record op nodes of different roles
+  for (auto node : nodes) {
+    if (node->IsVar()) continue;
+    int op_role = boost::get<int>(node->Op()->GetAttr(
+        framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
+    if ((op_role == static_cast<int>(framework::OpRole::kForward)) ||
+        (op_role & static_cast<int>(framework::OpRole::kBackward)) ||
+        (op_role & static_cast<int>(framework::OpRole::kLoss))) {
+      forward_backward_ops.push_back(node);
+    } else if ((op_role & static_cast<int>(framework::OpRole::kOptimize)) ||
+               (op_role & static_cast<int>(framework::OpRole::kDist)) ||
+               (op_role & static_cast<int>(framework::OpRole::kRPC))) {
+      optimize_ops.push_back(node);
+      auto op_role_var = node->Op()->GetNullableAttr(
+          OpProtoAndCheckerMaker::OpRoleVarAttrName());
+      auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
+      for (size_t i = 0; i < op_role_vars.size(); i += 2) {
+        grad_names.insert(op_role_vars[i + 1]);
+      }
+    } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
+      lr_ops.push_back(node);
+    } else {  // NOLINT
+      PADDLE_THROW("Invalid op_role: %d", static_cast<int>(op_role));
+    }
+  }
+
+  // 2. copy forward backward
+  ir::Node* prev_repeat_last_op_node = nullptr;
+  // record origin_grad -> repeated grad list map.
+  std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
+  std::map<std::string, std::vector<ir::Node*>> created;
+  std::unordered_set<std::string> bn_vars_need_rename;
+  for (int i = 0; i < num_repeats; ++i) {
+    std::unordered_set<ir::Node*> copied;
+    for (size_t node_idx = 0; node_idx < forward_backward_ops.size();
+         ++node_idx) {
+      auto node = forward_backward_ops[node_idx];
+      OpDesc repeated_op(*(node->Op()), node->Op()->Block());
+      // 3. rename grad outputs to current repeat.
+      for (auto outname : repeated_op.OutputArgumentNames()) {
+        if (grad_names.find(outname) != grad_names.end()) {
+          std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
+          repeated_op.RenameOutput(outname, new_gname);
+        }
+      }
+      // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
+      // not need this update
+      if (node->Name() == "batch_norm") {
+        // NOTE: assume bn op created by layers use save var as output mean and
+        // variance
+        std::string new_mean_name =
+            string::Sprintf("%s.repeat.%d", repeated_op.Input("Mean")[0], i);
+        std::string new_var_name = string::Sprintf(
+            "%s.repeat.%d", repeated_op.Input("Variance")[0], i);
+        bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
+        bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
+        VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
+                << new_mean_name;
+        repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
+        repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
+        repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],
+                                 new_mean_name);
+        repeated_op.RenameOutput(repeated_op.Output("VarianceOut")[0],
+                                 new_var_name);
+      }
+
+      // 3.9 do copy
+      auto repeated_node = result.CreateOpNode(&repeated_op);
+      copied.insert(node);
+
+      // 4. add deps between repeats
+      if (node_idx == forward_backward_ops.size() - 1) {
+        prev_repeat_last_op_node = repeated_node;
+      }
+      if (node_idx == 0 && prev_repeat_last_op_node) {
+        auto* depvar = result.CreateControlDepVar();
+        prev_repeat_last_op_node->outputs.push_back(depvar);
+        depvar->inputs.push_back(prev_repeat_last_op_node);
+        repeated_node->inputs.push_back(depvar);
+        depvar->outputs.push_back(repeated_node);
+      }
+
+      for (auto in_node : node->inputs) {
+        if (in_node->IsCtrlVar()) {
+          continue;
+        }
+        ir::Node* var = nullptr;
+        auto updated_var = UpdateGradVarDesc(in_node->Var(), i, grad_names,
+                                             bn_vars_need_rename);
+        // should be initialized by startup, how to initilize tensor in the
+        // scope?
+        if (node->Name() == "batch_norm" &&
+            bn_vars_need_rename.find(in_node->Name()) !=
+                bn_vars_need_rename.end()) {
+          // Create bn mean/variance for each repeat
+          var = result.CreateVarNode(&updated_var);
+          created[updated_var.Name()].push_back(var);
+          copied.insert(in_node);
+          repeated_node->inputs.push_back(var);
+          var->outputs.push_back(repeated_node);
+          continue;
+        }
+
+        // for other ops
+        if (in_node->inputs.empty() && i > 0) {
+          // do not copy head vars (inputs, params) in repeats > 0
+          var = created.at(in_node->Name()).back();
+        } else {
+          if (copied.find(in_node) == copied.end()) {
+            var = result.CreateVarNode(&updated_var);
+            if (grad_names.find(in_node->Var()->Name()) != grad_names.end()) {
+              grad_repeated_map[in_node].push_back(var);
+            }
+            copied.insert(in_node);
+            created[updated_var.Name()].push_back(var);
+          } else {
+            var = created.at(updated_var.Name()).back();
+          }
+        }
+        repeated_node->inputs.push_back(var);
+        var->outputs.push_back(repeated_node);
+      }
+      for (auto out_node : node->outputs) {
+        if (out_node->IsCtrlVar()) {
+          continue;
+        }
+        ir::Node* var = nullptr;
+        auto updated_var = UpdateGradVarDesc(out_node->Var(), i, grad_names,
+                                             bn_vars_need_rename);
+        if (copied.find(out_node) == copied.end()) {
+          var = result.CreateVarNode(&updated_var);
+          if (grad_names.find(out_node->Var()->Name()) != grad_names.end()) {
+            grad_repeated_map[out_node].push_back(var);
+          }
+          copied.insert(out_node);
+          created[updated_var.Name()].push_back(var);
+        } else {
+          var = created.at(updated_var.Name()).back();
+        }
+        repeated_node->outputs.push_back(var);
+        var->inputs.push_back(repeated_node);
+      }
+    }
+  }
+
+  // 5. create GRAD merge op node
+  for (auto kv : grad_repeated_map) {
+    OpDesc sum_op;
+    sum_op.SetType("sum");
+    std::vector<std::string> repeated_grad_names;
+    for (auto r : kv.second) {
+      repeated_grad_names.push_back(r->Var()->Name());
+    }
+    sum_op.SetInput("X", repeated_grad_names);
+    sum_op.SetOutput("Out", {kv.first->Var()->Name()});
+    sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+                   static_cast<int>(OpRole::kBackward));
+    auto sum_op_node = result.CreateOpNode(&sum_op);
+    for (auto r : kv.second) {
+      sum_op_node->inputs.push_back(r);
+      r->outputs.push_back(sum_op_node);
+    }
+    auto sum_out_var_node = result.CreateVarNode(kv.first->Var());
+    sum_op_node->outputs.push_back(sum_out_var_node);
+    sum_out_var_node->inputs.push_back(sum_op_node);
+    created[sum_out_var_node->Name()].push_back(sum_out_var_node);
+
+    OpDesc scale_op;
+    scale_op.SetType("scale");
+    scale_op.SetInput("X", {sum_out_var_node->Var()->Name()});
+    // NOTE: inplace scale.
+    scale_op.SetOutput("Out", {sum_out_var_node->Var()->Name()});
+    scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
+    scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+                     static_cast<int>(OpRole::kBackward));
+    auto scale_op_node = result.CreateOpNode(&scale_op);
+    scale_op_node->inputs.push_back(sum_out_var_node);
+    sum_out_var_node->outputs.push_back(scale_op_node);
+    auto scale_out_var_node = result.CreateVarNode(sum_out_var_node->Var());
+    scale_op_node->outputs.push_back(scale_out_var_node);
+    scale_out_var_node->inputs.push_back(scale_op_node);
+    created[scale_out_var_node->Name()].push_back(scale_out_var_node);
+  }
+  // 6. add optimize ops
+  {
+    auto copy_node = [&result, &created](ir::Node* node) {
+      auto op_node = result.CreateOpNode(node->Op());
+      // copy op ins/outs
+      // NOTE: for send/recv ops, the OpDesc uses ctrldepvar to describe
+      // dependencies, so create those depvars if OpDesc have in/outs.
+      for (auto in_node : node->inputs) {
+        if (in_node->IsCtrlVar() && !in_node->Var()) {
+          continue;
+        }
+        ir::Node* var = nullptr;
+        if (created.find(in_node->Name()) == created.end()) {
+          var = result.CreateVarNode(in_node->Var());
+          created[in_node->Name()].push_back(var);
+        } else {
+          var = created.at(in_node->Name()).back();
+        }
+        op_node->inputs.push_back(var);
+        var->outputs.push_back(op_node);
+      }
+      for (auto out_node : node->outputs) {
+        if (out_node->IsCtrlVar() && !out_node->Var()) {
+          continue;
+        }
+        auto var = result.CreateVarNode(out_node->Var());
+        created[out_node->Name()].push_back(var);
+        op_node->outputs.push_back(var);
+        var->inputs.push_back(op_node);
+      }
+    };
+    for (auto node : lr_ops) {
+      copy_node(node);
+    }
+    for (auto node : optimize_ops) {
+      copy_node(node);
+    }
+  }
+
+  result.ResolveHazard(created);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(multi_batch_merge_pass, paddle::framework::ir::BatchMergePass)
+    .RequirePassAttr(paddle::framework::ir::kNumRepeats);
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1e5aef20dbc60c18ed03038818bfd8ab217bf28
--- /dev/null
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// BatchMergePass is used to copy forward and backward ops for several
+// times to run several batches to simulate large batch size training
+// as if we have more than 1 GPUs.
+// User can define how many batches to run, gradients will be merged
+// through those repeats, and then do optimization using merged gradients.
+// This pass is extremely useful when doing large batch-size distributed
+// sync training, we can simulate even large batch size as if we have more
+// GPUs.
+
+class BatchMergePass : public Pass {
+ public:
+  virtual ~BatchMergePass() {}
+
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 5d6da9f1d76a3c0fc64b7ff35264e385cf19a14b..d6d42f5e92080aa57445e2d6ce59aa3faa89d22d 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -44,6 +44,7 @@ class Node {
     return op_desc_.get();
   }
 
+  // Please don't use this API!
   int id() const { return id_; }
 
   bool IsOp() const { return type_ == Type::kOperation; }
@@ -92,6 +93,7 @@ class Node {
   Node() = delete;
 
   static int count_;
+  // Please don't use this API or make this public.
   static void ResetId() { count_ = 0; }
   DISABLE_COPY_AND_ASSIGN(Node);
 };
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a1f65d274708dd208d7783c6273160c4c61738a
--- /dev/null
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "X"))
+                  ->assert_is_op_input("sequence_conv")
+                  ->assert_var_not_persistable();
+  patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope);
+  fuse_pattern(x);
+
+  // Create New OpDesc
+  auto fuse_creator = [&](Node* seqconv, Node* input, Node* seqconv_weight,
+                          Node* eltadd_bias, Node* relu_out) {
+    OpDesc op_desc;
+    op_desc.SetType("fusion_seqconv_eltadd_relu");
+    op_desc.SetInput("X", {input->Name()});
+    op_desc.SetInput("Filter", {seqconv_weight->Name()});
+    op_desc.SetInput("Bias", {eltadd_bias->Name()});
+    op_desc.SetAttr("contextLength", seqconv->Op()->GetAttr("contextLength"));
+    op_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart"));
+    op_desc.SetAttr("contextStride", seqconv->Op()->GetAttr("contextStride"));
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+    const std::string ColMat = patterns::UniqueKey("SeqConvColMat");
+    op_desc.SetOutput("ColMat", {ColMat});
+    op_desc.SetOutput("Out", {relu_out->Name()});
+    scope->Var(ColMat)->GetMutable<LoDTensor>();
+
+    auto* op = graph->CreateOpNode(&op_desc);
+    IR_NODE_LINK_TO(input, op);
+    IR_NODE_LINK_TO(seqconv_weight, op);
+    IR_NODE_LINK_TO(eltadd_bias, op);
+    IR_NODE_LINK_TO(op, relu_out);
+    return op;
+  };
+
+  int fusion_count{0};
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle SeqConv EltAdd Relu fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_bias, eltadd_bias, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_out, eltadd_out, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, fuse_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, fuse_pattern);
+
+    fuse_creator(seqconv, subgraph.at(x), seqconv_weight, eltadd_bias,
+                 relu_out);
+    std::unordered_set<const Node*> marked_nodes(
+        {seqconv, seqconv_out, eltadd, eltadd_out, relu});
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+std::unique_ptr<ir::Graph> SeqConvEltAddReluFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope());
+  AddStatis(fusion_count);
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
+              paddle::framework::ir::SeqConvEltAddReluFusePass);
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..dac9de71930c1768bdf416520caae6468449cd3d
--- /dev/null
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class SeqConvEltAddReluFusePass : public FusePassBase {
+ public:
+  virtual ~SeqConvEltAddReluFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 1e7da9a69c7cbf8c13306656599a759515802b76..669d08c70c9b7453264806b346a6c9eb211cfd4a 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
     PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
     for (size_t j = 0; j < lod.size(); ++j) {
       auto &sub_lod = new_lod[j];
-      auto &offset = sub_lod.back();
+      size_t offset = sub_lod.back();
       for (size_t k = 1; k < lod[j].size(); ++k) {
         sub_lod.push_back(lod[j][k] + offset);
       }
diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h
index 6d7b6a4ada8729e3698dab5d2b1861aac632be79..36a5c3c5d601390beedaf37ceb98ee2c63ecf5a6 100644
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@@ -18,6 +18,8 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
 using LoDTensorArray = std::vector<LoDTensor>;
-}
+
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 77386f4f069489b6ff7b927a281bdc286ff816e0..e1aac6dc5a92fb616f00de5806f044b83c2f503f 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -542,6 +542,33 @@ class CPUVector : public std::vector<T, std::allocator<T>> {
     this->reserve(this->size() + size_t(end - begin));
     this->insert(this->end(), begin, end);
   }
+
+  const T *CUDAData(platform::Place place) const {
+    PADDLE_THROW(
+        "Vector::CUDAData() method is not supported in CPU-only version");
+  }
+
+  T *CUDAMutableData(platform::Place place) {
+    PADDLE_THROW(
+        "Vector::CUDAMutableData() method is not supported in CPU-only "
+        "version");
+  }
+
+  const T *Data(platform::Place place) const {
+    PADDLE_ENFORCE(
+        platform::is_cpu_place(place),
+        "Vector::Data() method is not supported when not in CPUPlace");
+    return this->data();
+  }
+
+  T *MutableData(platform::Place place) {
+    PADDLE_ENFORCE(
+        platform::is_cpu_place(place),
+        "Vector::MutableData() method is not supported when not in CPUPlace");
+    return this->data();
+  }
+
+  const void *Handle() const { return static_cast<const void *>(this); }
 };
 
 template <typename T>
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 2840d503f1454271afb309efdd435225ab077dc0..7fb42feb95b4d54aec693228721c052f683f4d80 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() {
   ops_.swap(ops);
 }
 
-void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) {
-#ifdef PADDLE_WITH_MKLDNN
-  VLOG(3) << "use_mkldnn=True";
-  for (size_t block_id = 0; block_id < program.Size(); ++block_id) {
-    auto *block = const_cast<ProgramDesc &>(program).MutableBlock(block_id);
-    for (auto *op : block->AllOps()) {
-      if (op->HasAttr("use_mkldnn")) {
-        op->SetAttr("use_mkldnn", true);
-      }
-    }
-  }
-#else
-  LOG(WARNING)
-      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
-#endif
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 9374f3f4a35cc0f90e5b2d6e8b397784b8eae123..ddfa6e1f4d8b73f594fc381ab505797491cdd378 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -48,8 +48,6 @@ class NaiveExecutor {
 
   void CleanFeedFetchOps();
 
-  void EnableMKLDNN(const ProgramDesc& program);
-
  protected:
   void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index b29ac44699463312a1fdcea55e003daa75997302..8ece618f3f72552fedcffab3e03ebb30476b7cab 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -85,10 +85,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
       VLOG(3) << "input " << in << " is not LodTensor";
       return;
     }
-    PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR,
-                      "The %d-th output of Output(%s) must be LoDTensor.", j,
-                      out);
-
     out_var->SetLoDLevel(in_var->GetLoDLevel());
   }
 
@@ -423,8 +419,15 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
     }
     VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx());
   }
+
   void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
+
   void operator()(int64_t v) const { attr_->set_l(v); }
+
+  void operator()(const std::vector<int64_t> &v) const {
+    VectorToRepeated(v, attr_->mutable_longs());
+  }
+
   void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };
 
@@ -519,20 +522,14 @@ void OpDesc::InferShape(const BlockDesc &block) const {
 }
 
 void OpDesc::InferVarType(BlockDesc *block) const {
+  // There are a few places that var type can be set.
+  // When VarDesc is created, default set to LOD_TENSOR.
+  // When output variable is created, default is defaut set to LOD_TENSOR.
+  // We limit here to be the only place that operator defines its customized
+  // var type inference. Hence, we don't do any "default" setting here.
   auto &info = OpInfoMap::Instance().Get(this->Type());
   if (info.infer_var_type_) {
     info.infer_var_type_(*this, block);
-  } else {
-    // all output type is LoDTensor by default
-    VLOG(10) << this->Type()
-             << " has not registered InferVarType. Set output variables to "
-                "LOD_TENSOR";
-    for (auto &out_pair : this->outputs_) {
-      for (auto &out_var_name : out_pair.second) {
-        block->FindRecursiveOrCreateVar(out_var_name)
-            .SetType(proto::VarType::LOD_TENSOR);
-      }
-    }
   }
 }
 
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index b4205aba83e774fb9c08193124adb93935c00157..30c8a26c3d2f0068674aa70b4ff875a2f73c1dca 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -100,16 +100,6 @@ class OpDesc {
   std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
   std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
 
-  void SetInputMap(const VariableNameMap &input) {
-    this->inputs_ = input;
-    this->need_update_ = true;
-  }
-
-  void SetOutputMap(const VariableNameMap &output) {
-    this->outputs_ = output;
-    this->need_update_ = true;
-  }
-
   const VariableNameMap &Inputs() const { return inputs_; }
 
   const VariableNameMap &Outputs() const { return outputs_; }
@@ -131,10 +121,6 @@ class OpDesc {
 
   BlockDesc *Block() { return this->block_; }
 
-  const BlockDesc &BlockRef() const { return *this->block_; }
-
-  void SetBlock(BlockDesc *block) { this->block_ = block; }
-
  private:
   template <typename MapType>
   static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index df2a7a27ca4a6011b214202ac9bf4f30dc482ece..ca31303f77c4a30eb64c43404e214779ea78aeaf 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -21,7 +21,6 @@ namespace framework {
 void OpProtoAndCheckerMaker::Validate() {
   validated_ = true;
   CheckNoDuplicatedInOutAttrs();
-  CheckReuseVars();
 }
 
 OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
@@ -40,40 +39,6 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
   return OpProtoAndCheckerMaker::VariableBuilder{output};
 }
 
-void OpProtoAndCheckerMaker::Reuse(const std::string& name,
-                                   const std::string& reused_name) {
-  bool found = false;
-  proto::OpProto::Var* var;
-
-  for (auto& var : proto_->inputs()) {
-    if (var.name() == reused_name) {
-      found = true;
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found == true,
-                 "Input/Output name: %s reused_name: %s, one of them is not "
-                 "exists or not matched.",
-                 name, reused_name);
-
-  found = false;
-  for (int i = 0; i < proto_->outputs().size(); ++i) {
-    var = proto_->mutable_outputs()->Mutable(i);
-    if (var->name() == name) {
-      PADDLE_ENFORCE(!var->has_reuse(),
-                     "Output(%s) has been set reused var of %s", name,
-                     var->reuse());
-      found = true;
-      var->set_reuse(reused_name);
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found == true,
-                 "Input/Output name: %s reused_name: %s, one of them is not "
-                 "exists or not matched.",
-                 name, reused_name);
-}
-
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   std::unordered_set<std::string> names;
   auto checker = [&](const std::string& name) {
@@ -91,24 +56,6 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   }
 }
 
-void OpProtoAndCheckerMaker::CheckReuseVars() {
-  std::unordered_set<std::string> names;
-  for (auto& input : proto_->inputs()) {
-    names.insert(input.name());
-  }
-  auto checker = [&](const std::string& name, const std::string& reused) {
-    PADDLE_ENFORCE(
-        names.count(reused),
-        "Output [%s] reuse Input [%s], but the input is not registered.", name,
-        reused);
-  };
-  for (auto& output : proto_->outputs()) {
-    if (output.has_reuse()) {
-      checker(output.name(), output.reuse());
-    }
-  }
-}
-
 void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
                                         OpAttrChecker* attr_checker) {
   proto_ = proto;
@@ -124,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
            static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
            static_cast<int>(OpRole::kLoss) |
                static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize) |
+               static_cast<int>(OpRole::kLRSched),
            static_cast<int>(OpRole::kNotSpecified)})
       .SetDefault(static_cast<int>(OpRole::kNotSpecified));
   AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 4ed3cc45d66849267ef4945a03da1db76b53e4ea..4c59c73d8779eceb267ad532aabccabbd54b0df2 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -14,25 +14,26 @@ limitations under the License. */
 #pragma once
 
 #include <string>
-#include <unordered_set>
-
 #include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
 namespace paddle {
 namespace framework {
 
+//////////////////////////
+// Don't add more roles to make this too complicated!
+//////////////////////////
 enum class OpRole {
   kForward = 0x0000,
   kBackward = 0x0001,
   kOptimize = 0x0002,
   // RPC role is for send/recv releated op
-  kRPC = 0x0003,
+  kRPC = 0x0004,
   // Dist role is for split_byref/split_selected_rows/concat
   // used for distributed training.
-  kDist = 0x0004,
+  kDist = 0x0008,
   // Tag all learning rate scheduler operators.
-  kLRSched = 0x0005,
+  kLRSched = 0x0010,
 
   kLoss = 0x0100,
   // The default value of op's role. This should be only used for unittests and
@@ -73,11 +74,6 @@ class OpProtoAndCheckerMaker {
       var_->set_dispensable(true);
       return *this;
     }
-
-    VariableBuilder &Reuse(const std::string &name) {
-      var_->set_reuse(name);
-      return *this;
-    }
   };
 
   VariableBuilder AddInput(const std::string &name, const std::string &comment);
@@ -85,8 +81,6 @@ class OpProtoAndCheckerMaker {
   VariableBuilder AddOutput(const std::string &name,
                             const std::string &comment);
 
-  void Reuse(const std::string &name, const std::string &reused_name);
-
   template <typename T>
   TypedAttrChecker<T> &AddAttr(const std::string &name,
                                const std::string &comment,
@@ -105,8 +99,6 @@ class OpProtoAndCheckerMaker {
   void CheckNoDuplicatedInOutAttrs();
   void Validate();
 
-  void CheckReuseVars();
-
   proto::OpProto *proto_;
   OpAttrChecker *op_checker_;
   bool validated_{false};
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index b71c7b646857e11f291748c4c7c2af92b6d53231..a8030d377fdb4d4aef74b315e21792dad10fac96 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -47,120 +47,3 @@ TEST(ProtoMaker, DuplicatedInOut) {
   ASSERT_THROW(proto_maker(&op_proto, &op_checker),
                paddle::platform::EnforceNotMet);
 }
-
-class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddOutput("XOut", "output of test op").Reuse("X");
-  }
-};
-
-class TestInplaceProtoMaker2
-    : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddOutput("XOut", "output of test op").Reuse("X");
-    AddOutput("NoOut", "output of test op").Reuse("NotExists");
-  }
-};
-
-TEST(ProtoMaker, InplaceOutput) {
-  paddle::framework::proto::OpProto op_proto, op_proto2;
-  paddle::framework::OpAttrChecker op_checker;
-  TestInplaceProtoMaker proto_maker;
-  TestInplaceProtoMaker2 proto_maker2;
-
-  proto_maker(&op_proto, &op_checker);
-
-  ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
-               paddle::platform::EnforceNotMet);
-}
-
-// normal reuse
-class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddInput("Y", "input of test op");
-    AddOutput("Out", "output of test op");
-    AddOutput("XOut", "output of test op");
-    // avoid destructor exception.
-    // Validate();
-    TestReuse();
-  }
-
-  virtual void TestReuse() {}
-};
-
-// test duplicate reuse error
-class TestReuseProtoMaker2 : public TestReuseProtoMaker {
- public:
-  void TestReuse() {
-    Reuse("Out", "X");
-    Reuse("Out", "Y");
-  }
-};
-
-// NotExists Input
-class TestReuseProtoMaker3 : public TestReuseProtoMaker {
- public:
-  void TestReuse() {
-    Reuse("Out", "NotExists");
-    Reuse("XOut", "X");
-  }
-};
-
-// NotExists Output
-class TestReuseProtoMaker4 : public TestReuseProtoMaker {
- public:
-  void TestReuse() { Reuse("NotExists", "X"); }
-};
-
-TEST(ProtoMaker, Reuse) {
-  paddle::framework::proto::OpProto op_proto;
-  paddle::framework::OpAttrChecker op_checker;
-  TestReuseProtoMaker proto_maker;
-  proto_maker(&op_proto, &op_checker);
-}
-
-// NOTE(dzhwinter):
-// There is a Fatal CHECK on base class destructor, which will call abort inside
-// instead of
-// throw an exception. If we throw an exception in Make(), we will trigger the
-// CHECK and terminate the tests.
-//
-// I had tried to replace the default CHECK with a exception, however, it's
-// still not supported by glog.
-// the details:
-// https://github.com/google/glog/issues/249
-// https://github.com/facebookresearch/TensorComprehensions/issues/351
-/*
-TEST(ProtoMaker, ReuseWithException) {
-  paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
-  paddle::framework::OpAttrChecker op_checker;
-  TestReuseProtoMaker2 proto_maker2;
-  TestReuseProtoMaker3 proto_maker3;
-  TestReuseProtoMaker4 proto_maker4;
-  EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
-               paddle::platform::EnforceNotMet);
-
-  EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
-               paddle::platform::EnforceNotMet);
-
-  EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
-               paddle::platform::EnforceNotMet);
-}
-
-void FailureFunction() {
-  throw std::runtime_error("Check failed in destructor.");
-  // return 0;
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  google::InstallFailureFunction(&FailureFunction);
-  return RUN_ALL_TESTS();
-}
-*/
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 9f930065324f13f5aa79c214e820fb6fc2f3a166..45fc36c70633204dbfadbd10757c08b009d2cc74 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::RecordEvent record_event(Type(), pool.Get(place));
-  RunImpl(scope, place);
+
+  // The profile has a process-wide mutex, results in serious performance issue
+  // in concurrency scenerio. Here use an `if` to fix this issue.
+  // Please not remove the `if`, ask @Superjomn if there are any concern.
+  if (platform::IsProfileEnabled()) {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(Type(), pool.Get(place));
+    RunImpl(scope, place);
+  } else {
+    RunImpl(scope, place);
+  }
   VLOG(3) << place << " " << DebugStringEx(&scope);
 }
 
@@ -346,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() {
   }
 }
 
-static bool VarIsTensor(const Variable* var) {
-  return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
+static bool VarIsTensor(const Variable& var) {
+  return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
 }
 
-static const Tensor* GetTensorFromVar(Variable* var) {
-  if (var->IsType<LoDTensor>()) {
-    return var->GetMutable<LoDTensor>();
-  } else if (var->IsType<SelectedRows>()) {
-    return var->GetMutable<SelectedRows>()->mutable_value();
+const Tensor* GetTensorFromVar(const Variable& var) {
+  if (var.IsType<LoDTensor>()) {
+    return static_cast<const Tensor*>(&(var.Get<LoDTensor>()));
+  } else if (var.IsType<SelectedRows>()) {
+    return &(var.Get<SelectedRows>().value());
   } else {
     PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 var->Type().name());
+                 var.Type().name());
   }
 }
 
@@ -407,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   auto* var = InputVar(name);
-  return var == nullptr ? nullptr
-                        : GetTensorFromVar(const_cast<Variable*>(var));
+  return var == nullptr ? nullptr : GetTensorFromVar(*var);
 }
 
 template <>
@@ -420,7 +427,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
   std::transform(names.begin(), names.end(), std::back_inserter(res),
                  [&](const std::string& sub_name) {
                    auto var = scope_.FindVar(sub_name);
-                   return var == nullptr ? nullptr : GetTensorFromVar(var);
+                   return var == nullptr ? nullptr : GetTensorFromVar(*var);
                  });
   return res;
 }
@@ -762,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack(
   for (auto& var_name : inplace_vars) {
     VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
     auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
-    auto* transformed_tensor =
-        GetTensorFromVar(transfer_scope.FindVar(var_name));
+    auto* var = transfer_scope.FindVar(var_name);
+    PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
+                   var_name);
+    auto* transformed_tensor = GetTensorFromVar(*var);
     original_tensor->ShareDataWith(*transformed_tensor);
   }
 }
@@ -776,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData(
     for (auto& var_name : var_name_item.second) {
       auto* var = scope.FindVar(var_name);
       // Only tensor can be tranfer to another device.
-      if (var == nullptr || !VarIsTensor(var)) {
+      if (var == nullptr || !VarIsTensor(*var)) {
         continue;
       }
 
-      auto* tensor_in = GetTensorFromVar(var);
+      auto* tensor_in = GetTensorFromVar(*var);
       if (!tensor_in->IsInitialized()) {
         continue;
       }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 626b50edfd39424473be33e9f8baec5970471477..96ad3205235b921a7cf60ed674a8350f74d18509 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+const Tensor* GetTensorFromVar(const Variable& var);
 
 class OperatorBase;
 class ExecutionContext;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f06bad6c78c05804e583f859906b88fb7b500372..a45b9ec7a20ac3629d182f009b735d4d82fb5dc2 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -109,18 +109,9 @@ ParallelExecutor::ParallelExecutor(
   if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
     BCastParamsToDevices(bcast_vars);
   }
-  // Startup Program has been run. All local scopes has correct parameters.
+// Startup Program has been run. All local scopes has correct parameters.
 
-  // Step 2. Create vars in each scope;
-  std::vector<details::VariableInfo> var_infos;
-  for (auto *var : main_program.Block(0).AllVars()) {
-    var_infos.emplace_back();
-    var_infos.back().name_ = var->Name();
-    var_infos.back().type_ = var->GetType();
-    var_infos.back().persistable_ = var->Persistable();
-  }
-
-// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
 // ncclOp
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
@@ -156,13 +147,22 @@ ParallelExecutor::ParallelExecutor(
                            params, member_->local_scopes_, member_->use_cuda_);
 #endif
 
-  if (VLOG_IS_ON(5)) {
-    // If the loss_var_name is given, the number of graph should be only one.
-    if (loss_var_name.size()) {
-      PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
-                        "The number of graph should be only one");
+  // Step 3. Create vars in each scope. Passes may also create new vars.
+  //         skip control vars and empty vars
+  std::vector<details::VariableInfo> var_infos;
+  for (auto &node : graph->Nodes()) {
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+      var_infos.emplace_back();
+      var_infos.back().name_ = node->Var()->Name();
+      var_infos.back().type_ = node->Var()->GetType();
+      var_infos.back().persistable_ = node->Var()->Persistable();
     }
   }
+  // If the loss_var_name is given, the number of graph should be only one.
+  if (loss_var_name.size()) {
+    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
+                      "The number of graph should be only one");
+  }
 
   if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
     member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
@@ -187,6 +187,10 @@ void ParallelExecutor::BCastParamsToDevices(
     }
 
     auto &main_tensor = main_var->Get<LoDTensor>();
+    if (!main_tensor.IsInitialized()) {
+      VLOG(3) << "one in var not inited, return!";
+      continue;
+    }
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
@@ -299,6 +303,10 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 }
 
 ParallelExecutor::~ParallelExecutor() {
+  for (auto &p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+
   if (member_->own_local_scope_) {
     for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
       Scope *local_scope = member_->local_scopes_[i];
@@ -307,6 +315,10 @@ ParallelExecutor::~ParallelExecutor() {
       }
     }
   }
+
+  // member_ must be destructed before gcs_ since the destructor of
+  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
+  member_.reset();
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index fd386a5987f11ff64964e95eb7e9b83572dc790c..ef09b98b2aa91a9d729b94d15dbb676dde4092b6 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -75,7 +75,7 @@ class ParallelExecutor {
  private:
   void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
 
-  ParallelExecutorPrivate *member_;
+  std::unique_ptr<ParallelExecutorPrivate> member_;
 
 #ifdef PADDLE_WITH_CUDA
   // ref_cnts_ is only initialized when ParallelExecutor constructs, and then
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 589905828f7793c614c0fe12259e9ba5ab11ceac..4b9667113bc7918c1323f0213213a6ffdb7eed8e 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -126,7 +126,7 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
   std::vector<std::string> feed_target_names;
   for (auto *op : global_block.AllOps()) {
     if (op->Type() == kFeedOpType) {
-      int col = boost::get<int>(op->GetAttr("col"));
+      size_t col = boost::get<int>(op->GetAttr("col"));
       if (col >= feed_target_names.size()) {
         feed_target_names.resize(col + 1);
       }
@@ -143,7 +143,7 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
   std::vector<std::string> fetch_target_names;
   for (auto *op : global_block.AllOps()) {
     if (op->Type() == kFetchOpType) {
-      int col = boost::get<int>(op->GetAttr("col"));
+      size_t col = boost::get<int>(op->GetAttr("col"));
       if (col >= fetch_target_names.size()) {
         fetch_target_names.resize(col + 1);
       }
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 7e689a37da8a16bd9b1ac6650b9322d2eb5a2c85..48bde2785e6a51afc0d2905ac31fe20a3c3019b6 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -103,7 +103,7 @@ TEST(ProgramDesc, copy_ctor) {
       ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
       found_sub_block = true;
 
-      ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size());
+      ASSERT_EQ(2UL, op->GetBlocksAttrIds("sub_blocks").size());
       found_sub_blocks = true;
     }
   }
diff --git a/paddle/fluid/framework/reader_test.cc b/paddle/fluid/framework/reader_test.cc
index f0d07cb7c1367576084b9494e7758103bb45d1e5..d812417a38200bcfdbdeac78800190647510a144 100644
--- a/paddle/fluid/framework/reader_test.cc
+++ b/paddle/fluid/framework/reader_test.cc
@@ -39,8 +39,8 @@ TEST(READER, decorate_chain) {
   {
     auto endpoints = root->GetEndPoints();
     ASSERT_EQ(endpoints.size(), 2U);
-    ASSERT_NE(endpoints.count(end_point1.get()), 0);
-    ASSERT_NE(endpoints.count(end_point2.get()), 0);
+    ASSERT_NE(endpoints.count(end_point1.get()), 0UL);
+    ASSERT_NE(endpoints.count(end_point2.get()), 0UL);
   }
 
   {
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 1a727a2c8c759d010606d5b605823b7252b35c69..a4abd1b1283f08fb8431fbeea0cea17c8439fdd7 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;
@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   return FindVarInternal(name);
 }
 
+Variable* Scope::FindLocalVar(const std::string& name) const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return FindVarLocally(name);
+}
+
 const Scope* Scope::FindScope(const Variable* var) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   return FindScopeInternal(var);
 }
 
 void Scope::DropKids() {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
 bool Scope::HasKid(const Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 
 void Scope::DeleteScope(Scope* scope) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);
@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   RenameInternal(origin_name, new_name);
 }
 
 std::string Scope::Rename(const std::string& origin_name) const {
-  std::unique_lock<std::mutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index e42fff1d79d92fb7ed61768a614d8cd98f6775a0..9462620e829ec815e1553f6378a67463ea3b8aa3 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -63,6 +63,11 @@ class Scope {
   /// Caller doesn't own the returned Variable.
   Variable* FindVar(const std::string& name) const;
 
+  /// Find a variable in the current scope.
+  /// Return nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
+  Variable* FindLocalVar(const std::string& name) const;
+
   const Scope* parent() const { return parent_; }
 
   /// Find the scope or an ancestor scope that contains the given variable.
@@ -73,6 +78,8 @@ class Scope {
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
 
+  std::list<Scope*>& kids() const { return kids_; }
+
   /// Find if a scope exists in the kid scopes
   bool HasKid(const Scope* scope) const;
 
diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc
index 928e1ad8b9168e61ddc5782066a4aa29a4296a94..9c427a4ae4c9660b107ca891a60db306cb09301f 100644
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -91,7 +91,7 @@ TEST(SelectedRows, SparseTable) {
   ASSERT_TRUE(table.HasKey(10));
   ASSERT_TRUE(table.HasKey(8));
   ASSERT_TRUE(table.HasKey(6));
-  ASSERT_EQ(table.rows().size(), 3);
+  ASSERT_EQ(table.rows().size(), 3UL);
 
   framework::Tensor ids;
   ids.Resize(framework::make_ddim({4}));
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index cb2061c06a429d8e8116001a4aa4e8c46ea13428..a0a9a573603ceb6b577529101cb331adbc81337a 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) {
                                         platform::CPUPlace());
     EXPECT_EQ(p1, p2);
   }
+  // Not sure if it's desired, but currently, Tensor type can be changed.
+  {
+    framework::Tensor src_tensor;
+    int8_t* p1 = src_tensor.mutable_data<int8_t>(framework::make_ddim({1}),
+                                                 platform::CPUPlace());
+    EXPECT_NE(p1, nullptr);
+    *p1 = 1;
+
+    uint8_t* p2 = src_tensor.mutable_data<uint8_t>(framework::make_ddim({1}),
+                                                   platform::CPUPlace());
+    EXPECT_NE(p2, nullptr);
+    EXPECT_EQ(static_cast<int>(p2[0]), 1);
+  }
 
 #ifdef PADDLE_WITH_CUDA
   {
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1d7a2eb5b38255531880fe3d2e5321024caf0c6b..ca1e01c89f07c4ffc3979a6a6c3728328e0a1819 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   auto size = src.numel() * SizeOfType(src.type());
 
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     if (platform::is_same_place(src_place, dst_place)) {
+      if (src_ptr == dst_ptr) {
+        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+                << dst_place;
+        return;
+      }
       memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                    stream);
     } else {
@@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data from " << src_place << " to "
+              << dst_place;
+      return;
+    }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
@@ -130,9 +145,20 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
+    if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
+      VLOG(3) << "Skip copy the same data from " << src_place << " to "
+              << dst_place;
+      return;
+    }
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
+  } else if (platform::is_cuda_pinned_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
+                 nullptr);
   }
 #endif
 }
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index a1e5b967a86d10f3439db662af54bb82888027b9..793ccfc79fe56707f226477b9d50b1d972ab6a59 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) {
     EXPECT_EQ(src_ptr[i], dst_ptr[i]);
   }
 
+  TensorCopy(dst_tensor, *cpu_place, &dst_tensor);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
 
   Tensor slice_tensor = src_tensor.Slice(1, 2);
@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) {
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
     }
 
+    // Copy the same tensor
+    TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+    gpu_ctx.Wait();
+    const int* dst_ptr_tmp = dst_tensor.data<int>();
+    EXPECT_NE(src_ptr, dst_ptr_tmp);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
+    }
+
     Tensor slice_tensor = src_tensor.Slice(1, 2);
 
     // CPU Slice Tensor to GPU Tensor
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index 18cdca3a658a6a89e6ab637a7f38825756acfea8..a588cb417aebe94bd4aeda02b1bc8ba07a04b960 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0,
 
 namespace paddle {
 namespace framework {
-
 std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
 std::once_flag ThreadPool::init_flag_;
 
@@ -47,8 +46,7 @@ void ThreadPool::Init() {
   }
 }
 
-ThreadPool::ThreadPool(int num_threads)
-    : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
+ThreadPool::ThreadPool(int num_threads) : running_(true) {
   threads_.resize(num_threads);
   for (auto& thread : threads_) {
     // TODO(Yancey1989): binding the thread on the specify CPU number
@@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads)
 ThreadPool::~ThreadPool() {
   {
     // notify all threads to stop running
+    std::lock_guard<std::mutex> l(mutex_);
     running_ = false;
     scheduled_.notify_all();
   }
@@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() {
   }
 }
 
-void ThreadPool::Wait() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  completed_.wait(lock, [=] { return Done() == true; });
-}
-
 void ThreadPool::TaskLoop() {
-  while (running_) {
+  while (true) {
     std::unique_lock<std::mutex> lock(mutex_);
-    scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
 
-    if (!running_) {
-      break;
+    scheduled_.wait(
+        lock, [this] { return !this->tasks_.empty() || !this->running_; });
+
+    if (!running_ || tasks_.empty()) {
+      return;
     }
+
     // pop a task from the task queue
     auto task = std::move(tasks_.front());
     tasks_.pop();
-
-    --idle_threads_;
     lock.unlock();
 
     // run the task
     task();
-
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      ++idle_threads_;
-      if (Done()) {
-        completed_.notify_all();
-      }
-    }
   }
 }
 
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index 94111ee335b1a5df327b3e46d62069b4735c54f6..0687e628aaa4fb7b2e67938fa09a319c8bb35aff 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -57,15 +57,6 @@ class ThreadPool {
 
   ~ThreadPool();
 
-  // Returns the number of threads created by the constructor.
-  size_t Threads() const { return total_threads_; }
-
-  // Returns the number of currently idle threads.
-  size_t IdleThreads() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    return idle_threads_;
-  }
-
   // Run pushes a function to the task queue and returns a std::future
   // object.  To wait for the completion of the task, call
   // std::future::wait().
@@ -94,25 +85,13 @@ class ThreadPool {
     });
     std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
     tasks_.push(std::move(task));
-    lock.unlock();
     scheduled_.notify_one();
     return f;
   }
 
-  // Wait until all the tasks are completed.
-  void Wait();
-
  private:
   DISABLE_COPY_AND_ASSIGN(ThreadPool);
 
-  // If the task queue is empty and avaialbe is equal to the number of
-  // threads, means that all tasks are completed.  Note: this function
-  // is not thread-safe.  Returns true if all tasks are completed.
-  // Note: don't delete the data member total_threads_ and use
-  // threads_.size() instead; because you'd need to lock the mutex
-  // before accessing threads_.
-  bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
-
   // The constructor starts threads to run TaskLoop, which retrieves
   // and runs tasks from the queue.
   void TaskLoop();
@@ -125,14 +104,11 @@ class ThreadPool {
   static std::once_flag init_flag_;
 
   std::vector<std::unique_ptr<std::thread>> threads_;
-  const size_t total_threads_;
-  size_t idle_threads_;
 
   std::queue<Task> tasks_;
   std::mutex mutex_;
   bool running_;
   std::condition_variable scheduled_;
-  std::condition_variable completed_;
 };
 
 class ThreadPoolIO : ThreadPool {
diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc
index 27a4ffd4fcbf293a3dea1744b29384d0bee0c137..884d61e23428a0ad758946295ca9c470767e93ef 100644
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
@@ -19,10 +19,11 @@ limitations under the License. */
 
 namespace framework = paddle::framework;
 
-void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
-  std::vector<std::future<void>> fs;
+void do_sum(std::vector<std::future<void>>* fs, std::mutex* mu,
+            std::atomic<int>* sum, int cnt) {
   for (int i = 0; i < cnt; ++i) {
-    fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
+    std::lock_guard<std::mutex> l(*mu);
+    fs->push_back(framework::Async([sum]() { sum->fetch_add(1); }));
   }
 }
 
@@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) {
 }
 
 TEST(ThreadPool, ConcurrentRun) {
-  framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
   std::atomic<int> sum(0);
   std::vector<std::thread> threads;
+  std::vector<std::future<void>> fs;
+  std::mutex fs_mu;
   int n = 50;
   // sum = (n * (n + 1)) / 2
   for (int i = 1; i <= n; ++i) {
-    std::thread t(do_sum, pool, &sum, i);
+    std::thread t(do_sum, &fs, &fs_mu, &sum, i);
     threads.push_back(std::move(t));
   }
   for (auto& t : threads) {
     t.join();
   }
-  pool->Wait();
+  for (auto& t : fs) {
+    t.wait();
+  }
   EXPECT_EQ(sum, ((n + 1) * n) / 2);
 }
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index e099e40f121ff13657e563eb608feecbca0551be..2de6233a9e0d320ec9a06d547db3575eb61925c0 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -36,7 +36,7 @@ using Attribute =
     boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                    std::vector<float>, std::vector<std::string>, bool,
                    std::vector<bool>, BlockDesc*, int64_t,
-                   std::vector<BlockDesc*>>;
+                   std::vector<BlockDesc*>, std::vector<int64_t>>;
 
 using AttributeMap = std::unordered_map<std::string, Attribute>;
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 9794a193bcfaae19552b1f6fbdf2dab2898033d5..d31c8e3b7d66a0cdb2c4725783c9a24f049c666d 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,3 +1,6 @@
+if(WITH_TESTING)
+  include(test.cmake) # some generic cmake funtion for inference
+endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
 add_subdirectory(analysis)
@@ -30,7 +33,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor)
+cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
 
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
@@ -40,7 +43,7 @@ endif()
 
 # Create shared library
 cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-    DEPS ${fluid_modules} paddle_fluid_api)
+    DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index d4d2fd4634f9e11f3f002e11e177c332ced49885..0354f9e6e9588af601210b8a71ae98c1f90d62f0 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
 cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid)
 
-function (inference_analysis_test TARGET)
-    if(WITH_TESTING)
-        set(options "")
-        set(oneValueArgs "")
-        set(multiValueArgs SRCS ARGS EXTRA_DEPS)
-        cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-        set(mem_opt "")
-        if(WITH_GPU)
-            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
-        endif()
-        cc_test(${TARGET}
-                SRCS "${analysis_test_SRCS}"
-                DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
-        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
-    endif(WITH_TESTING)
+function(inference_analysis_test TARGET)
+  if(WITH_TESTING)
+     set(options "")
+     set(oneValueArgs "")
+     set(multiValueArgs SRCS ARGS EXTRA_DEPS)
+     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+     inference_base_test(${TARGET}
+             SRCS ${analysis_test_SRCS}
+             DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
+             ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS})
+  endif()
 endfunction(inference_analysis_test)
 
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index d780592eb9f79e39e34fcd3bd6b086992eaa931f..ef4142f334e503380dc7ccd74c348404ffe52ee6 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -101,7 +101,16 @@ Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
 
 void Analyzer::Run(Argument* argument) {
   std::vector<std::string> passes;
-  for (auto& pass : all_ir_passes_) {
+#ifdef PADDLE_WITH_MKLDNN
+  if (use_mkldnn_) {
+    VLOG(3) << "Adding MKL-DNN placement pass";
+    passes.push_back("mkldnn_placement_pass");
+  }
+#endif
+  // infer_clean_graph_pass should be the first default pass
+  // after mkldnn_placement_pass.
+  passes.push_back("infer_clean_graph_pass");
+  for (auto& pass : ir_passes_) {
     if (!disabled_ir_passes_.count(pass)) {
       passes.push_back(pass);
       passes.push_back("graph_viz_pass");  // add graphviz for debug.
@@ -117,11 +126,26 @@ void Analyzer::Run(Argument* argument) {
   }
 }
 
+Analyzer& Analyzer::IncludeAllIrPasses() {
+  ir_passes_ = all_ir_passes_;
+  return *this;
+}
+
 Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
   disabled_ir_passes_.insert(passes.begin(), passes.end());
   return *this;
 }
 
+Analyzer& Analyzer::IncludeIrPasses(const std::vector<std::string>& passes) {
+  ir_passes_ = passes;
+  return *this;
+}
+
+Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) {
+  use_mkldnn_ = use_mkldnn;
+  return *this;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 765145cb7da44ca13c5394ad1dc2e879e69d69d1..3af1d572dfd81197797dd7e57d87ba12c2f3548e 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -54,6 +54,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
   void Run(Argument* argument);
 
   Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
+  Analyzer& IncludeIrPasses(const std::vector<std::string>& passes);
+  Analyzer& IncludeAllIrPasses();
+  Analyzer& SetUseMkldnn(bool use_mkldnn);
 
   DISABLE_COPY_AND_ASSIGN(Analyzer);
 
@@ -64,23 +67,29 @@ class Analyzer : public OrderedRegistry<PassManager> {
   // larger fusion.
   const std::vector<std::string> all_ir_passes_{{
       // Manual update the passes here.
-      "infer_clean_graph_pass",        //
-      "attention_lstm_fuse_pass",      //
-      "embedding_fc_lstm_fuse_pass",   //
-      "fc_lstm_fuse_pass",             //
-      "mul_lstm_fuse_pass",            //
-      "fc_gru_fuse_pass",              //
-      "mul_gru_fuse_pass",             //
-      "seq_concat_fc_fuse_pass",       //
-      "fc_fuse_pass",                  //
-      "conv_bn_fuse_pass",             //
-      "conv_eltwiseadd_bn_fuse_pass",  //
+      "attention_lstm_fuse_pass",       //
+      "seqconv_eltadd_relu_fuse_pass",  //
+      "embedding_fc_lstm_fuse_pass",    //
+      "fc_lstm_fuse_pass",              //
+      "mul_lstm_fuse_pass",             //
+      "fc_gru_fuse_pass",               //
+      "mul_gru_fuse_pass",              //
+      "seq_concat_fc_fuse_pass",        //
+      "fc_fuse_pass",                   //
+      "conv_bn_fuse_pass",              //
+      "conv_eltwiseadd_bn_fuse_pass",   //
 #ifdef PADDLE_WITH_MKLDNN
-      "conv_relu_mkldnn_fuse_pass",  //
+      "depthwise_conv_mkldnn_pass",             //
+      "conv_bias_mkldnn_fuse_pass",             //
+      "conv_relu_mkldnn_fuse_pass",             //
+      "conv_elementwise_add_mkldnn_fuse_pass",  //
 #endif
   }};
 
   std::unordered_set<std::string> disabled_ir_passes_;
+  // Ir passes to run
+  std::vector<std::string> ir_passes_;
+  bool use_mkldnn_;
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index f90910ac0d0a897ef01d4ca2bd0bca575baf4c40..5430e5c1ef1c70d27295ebc1a9bd427cd95f006a 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -51,9 +51,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
   config.model_dir = model_path;
   config.use_gpu = false;
   config.device = 0;
-  auto predictor =
-      ::paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
-          config);
+  auto predictor = ::paddle::CreatePaddlePredictor<NativeConfig>(config);
 
   // One single batch
 
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 1682011c3d8cc9927a4b026b370671798cace625..50ce20621fb289023ecccf7bb39d98169765d5ee 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 
@@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type,
   op->SetType(type);
   op->SetInput("Xs", inputs);
   op->SetOutput("Xs", outputs);
+  op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(framework::OpRole::kForward));
 }
 
 TEST(DataFlowGraph, Build_IR_Graph) {
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 0ddd5d53f836131fe37d412fc867cb38f11ee2b5..49a9ebe3ddec1e4fd59ae1155a706859e249d25c 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -17,32 +17,14 @@ if(APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
 
-
 set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
 
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
 endif()
 
-function(inference_api_test TARGET_NAME)
-    if (WITH_TESTING)
-        set(options "")
-        set(oneValueArgs SRC)
-        set(multiValueArgs ARGS)
-        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-        cc_test(${TARGET_NAME}
-                SRCS ${inference_test_SRC}
-                DEPS "${inference_deps}"
-                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
-        if(inference_test_ARGS)
-            set_tests_properties(${TARGET_NAME}
-                    PROPERTIES DEPENDS "${inference_test_ARGS}")
-        endif()
-    endif(WITH_TESTING)
-endfunction(inference_api_test)
-
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
+cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
@@ -50,10 +32,11 @@ cc_test(test_paddle_inference_api
         SRCS api_tester.cc
         DEPS paddle_inference_api)
 
-inference_api_test(test_api_impl SRC api_impl_tester.cc
-                    ARGS test_word2vec test_image_classification)
-
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+if(WITH_TESTING)
+  inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
+                      ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
+  set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
+endif()
 cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
         ARGS --dirname=${PYTHON_TESTS_DIR}/book)
 
@@ -61,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
         SRCS api_tensorrt_subgraph_engine.cc
         DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
-
-inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
+  if(WITH_TESTING)
+    inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps}
+                      ARGS --dirname=${WORD2VEC_MODEL_DIR})
+  endif()
 endif()
 
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 3bc6af5241c41bd805699121d614d431d46d863f..54c37fe64590aa82d7100c93c4c5c4ee36491421 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -25,9 +25,11 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(profile);
+DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 
@@ -47,6 +49,9 @@ bool AnalysisPredictor::Init(
   }
 #endif
 
+  // no matter with or without MKLDNN
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
+
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
     LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
@@ -72,15 +77,12 @@ bool AnalysisPredictor::Init(
     inference_program_ = program;
   }
 
-  if (config_._use_mkldnn) {
-    executor_->EnableMKLDNN(*inference_program_);
-  }
-
   executor_->Prepare(scope_.get(), *inference_program_, 0,
                      config_.use_feed_fetch_ops);
 
   // Get the feed_target_names and fetch_target_names
   PrepareFeedFetch();
+
   return true;
 }
 
@@ -108,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
@@ -220,10 +226,24 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 
   argument_.origin_program_desc.reset(
       new ProgramDesc(*inference_program_->Proto()));
-  PADDLE_ENFORCE(
-      config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude,
-      "Only kExclude is supported yet.");
-  Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
+
+  switch (config_.ir_mode) {
+    case contrib::AnalysisConfig::IrPassMode::kExclude:
+      Analyzer()
+          .IncludeAllIrPasses()
+          .SetUseMkldnn(config_._use_mkldnn)
+          .DisableIrPasses(config_.ir_passes)
+          .Run(&argument_);
+      break;
+    case contrib::AnalysisConfig::IrPassMode::kInclude:
+      Analyzer()
+          .SetUseMkldnn(config_._use_mkldnn)
+          .IncludeIrPasses(config_.ir_passes)
+          .Run(&argument_);
+      break;
+    default:
+      LOG(ERROR) << "Only kExclude and kInclude modes are supoorted yet.";
+  }
 
   CHECK(argument_.transformed_program_desc);
   VLOG(5) << "to prepare executor";
@@ -307,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 
 bool AnalysisPredictor::ZeroCopyRun() {
   executor_->Run();
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
@@ -335,6 +358,19 @@ bool AnalysisPredictor::LoadProgramDesc() {
   }
   return true;
 }
+
+AnalysisPredictor::~AnalysisPredictor() {
+#if !defined(_WIN32)
+  if (FLAGS_profile) {
+    platform::DisableProfiler(platform::EventSortingKey::kTotal,
+                              "./profile.log");
+  }
+#endif
+  if (sub_scope_) {
+    scope_->DeleteScope(sub_scope_);
+  }
+}
+
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
   auto *x = new AnalysisPredictor(config_);
   x->Init(scope_, inference_program_);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 0d01d7ac2b29ea6364b07af9bb3bdeb5ced6bd00..b7dc2067332278c1c38df4beefb5059efe76417f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
 
@@ -72,6 +73,7 @@ class AnalysisPredictor : public PaddlePredictor {
   template <typename T>
   void GetFetchOne(const framework::LoDTensor &fetchs,
                    PaddleTensor *output_data);
+  ~AnalysisPredictor();
 
  private:
   contrib::AnalysisConfig config_;
@@ -87,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor {
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 1d25f55b3188a684fe38df1417d114348cfa2e8a..13c25da1b52742e6114b294847c21ce735b9fc21 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -27,9 +27,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
   config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
   config.use_feed_fetch_ops = false;
 
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
 
   auto w0 = predictor->GetInputTensor("firstw");
   auto w1 = predictor->GetInputTensor("secondw");
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 6682e0a81b20c82aa668a249d37986386d769c83..d06ab8f8c8e3c0adf4a4074eb1450012126e83ea 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -22,10 +22,13 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(profile, false, "Turn on profiler for fluid");
+DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace {
@@ -72,6 +75,9 @@ bool NativePaddlePredictor::Init(
   }
 #endif
 
+  // no matter with or without MKLDNN
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
+
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
   } else {
@@ -152,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 7882f6a53c7ce9a2486158ea9b50c018d1814091..4e4ab47ca9c5e37f2714ebd48d250c23c7e9b117 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -26,11 +26,11 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
@@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor {
   std::vector<framework::OpDesc *> fetchs_;
   // Do not use unique_ptr, use parent scope to delete
   framework::Scope *sub_scope_{nullptr};
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index bed7c871311e476b4ed113d982c690383ad732de..5152b8670ddb206f0927c03149684af4a096df42 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -27,7 +27,9 @@ limitations under the License. */
 #define ACC_DIFF 1e-3
 #endif
 
-DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_string(word2vec_dirname, "",
+              "Directory of the word2vec inference model.");
+DEFINE_string(book_dirname, "", "Directory of the book inference model.");
 
 namespace paddle {
 
@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
 
 NativeConfig GetConfig() {
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
+  config.model_dir = FLAGS_word2vec_dirname;
   LOG(INFO) << "dirname  " << config.model_dir;
   config.fraction_of_gpu_memory = 0.15;
 #ifdef PADDLE_WITH_CUDA
@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) {
   NativeConfig config = GetConfig();
   config.use_gpu = use_gpu;
   config.model_dir =
-      FLAGS_dirname + "/image_classification_resnet.inference.model";
+      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
 
   const bool is_combined = false;
   std::vector<std::vector<int64_t>> feed_target_shapes =
@@ -187,7 +189,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
   std::vector<std::thread> threads;
   for (int tid = 0; tid < num_jobs; ++tid) {
     threads.emplace_back([&, tid]() {
-      auto predictor = main_predictor->Clone();
+      auto predictor = CreatePaddlePredictor(config);
       auto& local_inputs = paddle_tensor_feeds[tid];
       std::vector<PaddleTensor> local_outputs;
       ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
@@ -205,7 +207,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
       float* ref_data = refs[tid].data<float>();
       EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
       for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
+        EXPECT_NEAR(ref_data[i], data[i], 2e-3);
       }
     });
   }
@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   NativeConfig config = GetConfig();
   config.use_gpu = use_gpu;
   config.model_dir =
-      FLAGS_dirname + "/image_classification_resnet.inference.model";
+      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
 
   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
   std::vector<framework::LoDTensor> jobs(num_jobs);
@@ -245,7 +247,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   std::vector<std::thread> threads;
   for (int tid = 0; tid < num_jobs; ++tid) {
     threads.emplace_back([&, tid]() {
-      auto predictor = main_predictor->Clone();
+      auto predictor = CreatePaddlePredictor(config);
       auto& local_inputs = paddle_tensor_feeds[tid];
       std::vector<PaddleTensor> local_outputs;
       ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
@@ -271,7 +273,7 @@ TEST(inference_api_native, word2vec_cpu_threads) {
   MainThreadsWord2Vec(false /*use_gpu*/);
 }
 TEST(inference_api_native, image_classification_cpu) {
-  MainThreadsImageClassification(false /*use_gpu*/);
+  MainImageClassification(false /*use_gpu*/);
 }
 TEST(inference_api_native, image_classification_cpu_threads) {
   MainThreadsImageClassification(false /*use_gpu*/);
@@ -279,15 +281,17 @@ TEST(inference_api_native, image_classification_cpu_threads) {
 
 #ifdef PADDLE_WITH_CUDA
 TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
-TEST(inference_api_native, word2vec_gpu_threads) {
-  MainThreadsWord2Vec(true /*use_gpu*/);
-}
+// Turn off temporarily for the unstable result.
+// TEST(inference_api_native, word2vec_gpu_threads) {
+//   MainThreadsWord2Vec(true /*use_gpu*/);
+// }
 TEST(inference_api_native, image_classification_gpu) {
-  MainThreadsImageClassification(true /*use_gpu*/);
-}
-TEST(inference_api_native, image_classification_gpu_threads) {
-  MainThreadsImageClassification(true /*use_gpu*/);
+  MainImageClassification(true /*use_gpu*/);
 }
+// Turn off temporarily for the unstable result.
+// TEST(inference_api_native, image_classification_gpu_threads) {
+//   MainThreadsImageClassification(true /*use_gpu*/);
+// }
 
 #endif
 
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
index fc6310e90b0257bc84742fb617a00f5778bb1866..89c9a65cb06ba565f0e0cbdb9b6031c6adbcb64e 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -29,23 +29,20 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
 
   //# 1. Create PaddlePredictor with a config.
   NativeConfig config0;
-  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.model_dir = FLAGS_dirname;
   config0.use_gpu = true;
   config0.fraction_of_gpu_memory = 0.3;
   config0.device = 0;
 
   MixedRTConfig config1;
-  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.model_dir = FLAGS_dirname;
   config1.use_gpu = true;
   config1.fraction_of_gpu_memory = 0.3;
   config1.device = 0;
   config1.max_batch_size = 10;
 
-  auto predictor0 =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
-  auto predictor1 =
-      CreatePaddlePredictor<MixedRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
+  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
+  auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
 
   for (int batch_id = 0; batch_id < 1; batch_id++) {
     //# 2. Prepare input.
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index ec8471ef960a2fc44af23c52be09cd678fab3f70..49683eab07a2f5bc008272038a27bdb277396284 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -52,6 +52,7 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
 if (NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
@@ -61,8 +62,8 @@ endif(NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
 
-if (NOT WIN32) 
-  if (USE_TENSORRT AND WITH_GPU) 
+if (NOT WIN32)
+  if (USE_TENSORRT AND WITH_GPU)
       include_directories("${TENSORRT_INCLUDE_DIR}")
       link_directories("${TENSORRT_LIB_DIR}")
   endif()
@@ -77,13 +78,14 @@ endif(NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/paddle/fluid/inference")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
+link_directories("${PADDLE_LIB}/paddle/lib")
 
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} 
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
                ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
@@ -97,17 +99,17 @@ endif()
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
+      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
+      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
 
 if (NOT WIN32)
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
     ${MATH_LIB} ${MKLDNN_LIB}
-    glog gflags protobuf snappystream snappy z
+    glog gflags protobuf snappystream snappy z xxhash
     ${EXTERNAL_LIB})
 else()
 set(DEPS ${DEPS}
@@ -120,7 +122,7 @@ endif(NOT WIN32)
 
 if(WITH_GPU)
   if(NOT WIN32)
-    if (USE_TENSORRT) 
+    if (USE_TENSORRT)
       set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
       set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
     endif()
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 65c95f0834a9356fc14faed8342f5d1e474edf8f..ff718077c1ba6b10fe87aac10d84f96a23ad6bba 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -5,22 +5,23 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
 TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, defalut to /usr/local/TensorRT/include
 TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
+inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir
 
 cd `dirname $0`
 current_dir=`pwd`
 if [ $2 == ON ]; then
   # You can export yourself if move the install path
-  MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
+  MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
 fi
 if [ $3 == ON ]; then
   use_gpu_list='true false'
-else    
+else
   use_gpu_list='false'
 fi
 
 USE_TENSORRT=OFF
-if [ [-d"$TENSORRT_INCLUDE_DIR"] -a [-d"$TENSORRT_LIB_DIR"] ]; then
+if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
   USE_TENSORRT=ON
 fi
 
@@ -55,13 +56,13 @@ cd build
 for WITH_STATIC_LIB in ON OFF; do
   # -----simple_on_word2vec-----
   rm -rf *
-  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  cmake .. -DPADDLE_LIB=${inference_install_dir} \
     -DWITH_MKL=$TURN_ON_MKL \
     -DDEMO_NAME=simple_on_word2vec \
     -DWITH_GPU=$TEST_GPU_CPU \
     -DWITH_STATIC_LIB=$WITH_STATIC_LIB
   make -j
-  word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
+  word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
   if [ -d $word2vec_model ]; then
     for use_gpu in $use_gpu_list; do
       ./simple_on_word2vec \
@@ -75,14 +76,14 @@ for WITH_STATIC_LIB in ON OFF; do
   fi
   # ---------vis_demo---------
   rm -rf *
-  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  cmake .. -DPADDLE_LIB=${inference_install_dir} \
     -DWITH_MKL=$TURN_ON_MKL \
     -DDEMO_NAME=vis_demo \
     -DWITH_GPU=$TEST_GPU_CPU \
     -DWITH_STATIC_LIB=$WITH_STATIC_LIB
   make -j
   for use_gpu in $use_gpu_list; do
-    for vis_demo_name in $vis_demo_list; do 
+    for vis_demo_name in $vis_demo_list; do
       ./vis_demo \
         --modeldir=$DATA_DIR/$vis_demo_name/model \
         --data=$DATA_DIR/$vis_demo_name/data.txt \
@@ -94,11 +95,11 @@ for WITH_STATIC_LIB in ON OFF; do
       fi
     done
   done
-  
+
   # --------tensorrt mobilenet------
   if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
     rm -rf *
-    cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    cmake .. -DPADDLE_LIB=${inference_install_dir} \
       -DWITH_MKL=$TURN_ON_MKL \
       -DDEMO_NAME=trt_mobilenet_demo \
       -DWITH_GPU=$TEST_GPU_CPU \
@@ -106,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do
       -DUSE_TENSORRT=$USE_TENSORRT \
       -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
       -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR
-    make -j 
+    make -j
     ./trt_mobilenet_demo \
       --modeldir=$DATA_DIR/mobilenet/model \
       --data=$DATA_DIR/mobilenet/data.txt \
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 8058d7e881025b1d806efe187d4523adadff367d..5446fd4d4256c10442a53ea09a447cf308cbd681 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include <memory>
 #include <thread>  //NOLINT
 
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle/include/paddle_inference_api.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");
@@ -42,8 +42,7 @@ void Main(bool use_gpu) {
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
 
   for (int batch_id = 0; batch_id < 3; batch_id++) {
     //# 2. Prepare input.
@@ -85,8 +84,7 @@ void MainThreads(int num_threads, bool use_gpu) {
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
-  auto main_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
 
   std::vector<std::thread> threads;
   for (int tid = 0; tid < num_threads; ++tid) {
diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
index ffb12b5871f088f15e43a1b0ff7e2a8b2f5fd079..4a8404f21c6ec6a1647e964ac3538b4b49151009 100644
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
-#include "paddle/fluid/inference/demo_ci/utils.h"
+#include "utils.h"  // NOLINT
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DEFINE_string(modeldir, "", "Directory of the inference model.");
diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h
index 4792c97fe7d0a3f9c904774ad4a8e580cefcf237..d70c6aea791219a40c3164b51499f9d5e562be71 100644
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -18,7 +18,7 @@
 #include <iostream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle/include/paddle_inference_api.h"
 
 namespace paddle {
 namespace demo {
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index db61786e2fefda29256d84b5357028ec9c39b014..8d546e3e9c740c10bcf2984e073c956e3612625c 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
-#include "paddle/fluid/inference/demo_ci/utils.h"
+#include "utils.h"  // NOLINT
 
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -34,12 +34,13 @@ DEFINE_bool(use_gpu, false, "Whether use gpu.");
 namespace paddle {
 namespace demo {
 
+using contrib::AnalysisConfig;
 /*
- * Use the native fluid engine to inference the demo.
+ * Use the native and analysis fluid engine to inference the demo.
  */
 void Main(bool use_gpu) {
-  std::unique_ptr<PaddlePredictor> predictor;
-  NativeConfig config;
+  std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
+  AnalysisConfig config;
   config.param_file = FLAGS_modeldir + "/__params__";
   config.prog_file = FLAGS_modeldir + "/__model__";
   config.use_gpu = use_gpu;
@@ -49,8 +50,8 @@ void Main(bool use_gpu) {
   }
 
   VLOG(3) << "init predictor";
-  predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  predictor = CreatePaddlePredictor<NativeConfig>(config);
+  analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
 
   VLOG(3) << "begin to process data";
   // Just a single batch of data.
@@ -68,7 +69,7 @@ void Main(bool use_gpu) {
   input.dtype = PaddleDType::FLOAT32;
 
   VLOG(3) << "run executor";
-  std::vector<PaddleTensor> output;
+  std::vector<PaddleTensor> output, analysis_output;
   predictor->Run({input}, &output, 1);
 
   VLOG(3) << "output.size " << output.size();
@@ -77,6 +78,10 @@ void Main(bool use_gpu) {
 
   // compare with reference result
   CheckOutput(FLAGS_refer, tensor);
+
+  // the analysis_output has some diff with native_output,
+  // TODO(luotao): add CheckOutput for analysis_output later.
+  analysis_predictor->Run({input}, &analysis_output, 1);
 }
 
 }  // namespace demo
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ae6c6dc9f44650c1c62f5be5448864d817513b1
--- /dev/null
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+
+namespace paddle {
+namespace details {
+
+// Should be called after the parameters are loaded.
+void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
+  if (flag_) {
+    for (auto &var_name : scope->LocalVarNames()) {
+      auto *var = scope->FindVar(var_name);
+      // TODO(Superjomn) should avoid the case when a TensorArray is a
+      // parameter.
+      if (var_name == "feed" || var_name == "fetch") continue;
+      if (var->Type() == typeid(framework::LoDTensorArray)) {
+        VLOG(4) << "collect " << var_name;
+        arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
+      }
+    }
+    for (auto *kid : scope->kids()) {
+      CollectTensorArrays(kid);
+    }
+
+    VLOG(3) << "Collect " << arrays_.size() << " arrays";
+    flag_ = false;
+  }
+}
+
+// Should be called when `Run` finished.
+void TensorArrayBatchCleaner::ResetTensorArray() {
+  for (auto *arr : arrays_) {
+    arr->clear();
+  }
+}
+
+}  // namespace details
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..a39449ff0e67786815dfb8d2d30d79dcdba757d7
--- /dev/null
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace details {
+
+// Clean the TensorArray each batch to make the behavior the same with the
+// training phase.
+struct TensorArrayBatchCleaner {
+  // Fix the tensor array not clear in the inference scenarios.
+  void CollectTensorArrays(framework::Scope *scope);
+  void ResetTensorArray();
+
+ private:
+  bool flag_{true};
+  std::vector<framework::LoDTensorArray *> arrays_;
+};
+
+}  // namespace details
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 24f59cf43a9700ff1732e1ef6ad82e1a6294eede..e46dc1326951f68fd030f2208b9bea1647d0026d 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -160,7 +160,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
                       double latency, int epoch = 1) {
   LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
             << ", threads: " << num_threads << ", thread id: " << tid
-            << ", latency: " << latency << "ms ======";
+            << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f)
+            << " ======";
   if (epoch > 1) {
     int samples = batch_size * epoch;
     LOG(INFO) << "====== sample number: " << samples
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index d2876dc27c8826c2f27be21fa0b9fef92d03067a..a755ccb93bdee018dfeaf91157e7971b4d4cd832 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -124,7 +124,7 @@ class ZeroCopyTensor {
   std::vector<std::vector<size_t>> lod() const;
 
  protected:
-  ZeroCopyTensor(void* scope) : scope_{scope} {}
+  explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
   void SetName(const std::string& name) { name_ = name; }
   void* FindTensor() const;
 
@@ -263,6 +263,7 @@ struct AnalysisConfig : public NativeConfig {
   bool enable_ir_optim = true;
   // Manually determine the IR passes to run.
   IrPassMode ir_mode{IrPassMode::kExclude};
+  // passes to be excluded/included
   std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"};
 
   // NOT stable yet.
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index f9bb66a6e9f81a10368db7710108c319860e940a..677f85152f202b514d0563f885d872c84faba19a 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -42,16 +42,22 @@ class Pool2dOpConverter : public OpConverter {
         boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
     std::vector<int> paddings =
         boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+    bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
 
+    nvinfer1::Dims input_shape = input1->getDimensions();
+    int nbDims = input_shape.nbDims;
     nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
     if (global_pooling == true) {
-      nvinfer1::Dims input_shape = input1->getDimensions();
-      int nbDims = input_shape.nbDims;
       nv_ksize.d[0] = input_shape.d[nbDims - 2];
       nv_ksize.d[1] = input_shape.d[nbDims - 1];
+      nv_strides.h() = 1;
+      nv_strides.w() = 1;
+      nv_paddings.h() = 0;
+      nv_paddings.w() = 0;
     }
-    const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-    const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
 
     PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
 
@@ -64,6 +70,36 @@ class Pool2dOpConverter : public OpConverter {
       PADDLE_THROW("TensorRT unsupported pooling type!");
     }
 
+    if (ceil_mode) {
+      nvinfer1::DimsHW pre_pad(0, 0);
+      nvinfer1::DimsHW post_pad(0, 0);
+      int input_height = input_shape.d[nbDims - 2];
+      int input_width = input_shape.d[nbDims - 1];
+      int floor_h_output_size =
+          (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+      int ceil_h_output_size =
+          (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
+              strides[0] +
+          1;
+
+      int floor_w_output_size =
+          (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+      int ceil_w_output_size =
+          (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) /
+              strides[1] +
+          1;
+      if (floor_h_output_size != ceil_h_output_size) {
+        post_pad.h() = strides[0] - 1;
+      }
+
+      if (floor_w_output_size != ceil_w_output_size) {
+        post_pad.w() = strides[1] - 1;
+      }
+      auto* layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Padding, *const_cast<nvinfer1::ITensor*>(input1), pre_pad,
+          post_pad);
+      input1 = layer->getOutput(0);
+    }
     auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
                                        *const_cast<nvinfer1::ITensor*>(input1),
                                        nv_pool_type, nv_ksize);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
index aedd6b62df040eeee4e48f628128511cd8bf4439..ee597f8465c218c0fb6648374c128cabf7b033fb 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -20,18 +20,20 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void test_pool2d(bool global_pooling) {
+void test_pool2d(bool global_pooling, bool ceil_mode) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
   TRTConvertValidation validator(5, parameters, scope, 1 << 15);
 
   // The ITensor's Dims should not contain the batch size.
   // So, the ITensor's Dims of input and output should be C * H * W.
-  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
+  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 13, 14));
   if (global_pooling)
     validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1));
+  else if (ceil_mode)
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 7));
   else
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 6));
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -39,7 +41,7 @@ void test_pool2d(bool global_pooling) {
   desc.SetInput("X", {"pool2d-X"});
   desc.SetOutput("Out", {"pool2d-Out"});
 
-  std::vector<int> ksize({2, 2});
+  std::vector<int> ksize({3, 3});
   std::vector<int> strides({2, 2});
   std::vector<int> paddings({0, 0});
   std::string pooling_t = "max";
@@ -49,6 +51,7 @@ void test_pool2d(bool global_pooling) {
   desc.SetAttr("strides", strides);
   desc.SetAttr("paddings", paddings);
   desc.SetAttr("global_pooling", global_pooling);
+  desc.SetAttr("ceil_mode", ceil_mode);
 
   LOG(INFO) << "set OP";
   validator.SetOp(*desc.Proto());
@@ -57,9 +60,10 @@ void test_pool2d(bool global_pooling) {
   validator.Execute(3);
 }
 
-TEST(Pool2dOpConverter, normal) { test_pool2d(false); }
+TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
+TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
 
-TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); }
+TEST(Pool2dOpConverter, test_ceil_mode) { test_pool2d(false, true); }
 
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/test.cmake b/paddle/fluid/inference/test.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ab3a30ce6bba14a7d5ec700a159d90031e6b5dc7
--- /dev/null
+++ b/paddle/fluid/inference/test.cmake
@@ -0,0 +1,31 @@
+set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
+set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
+    "A path setting inference demo download directories.")
+function (inference_download install_dir url filename)
+    message(STATUS "Download inference test stuff from ${url}/${filename}")
+    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
+    message(STATUS "finish downloading ${filename}")
+endfunction()
+
+function (inference_download_and_uncompress install_dir url filename)
+    inference_download(${install_dir} ${url} ${filename})
+    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+endfunction()
+
+set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
+if (NOT EXISTS ${WORD2VEC_INSTALL_DIR})
+    inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
+endif()
+set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
+
+function (inference_base_test TARGET)
+   set(options "")
+   set(oneValueArgs "")
+   set(multiValueArgs SRCS ARGS DEPS)
+   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+   if(WITH_GPU)
+       set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
+   endif()
+   cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS})
+endfunction()
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index c3dd1f433691e1c96e9f38ef7b595befad26408f..71fdc67068b3d92a774db82f569d212f6cffad78 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -1,18 +1,4 @@
-set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
-set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
-    "A path setting inference demo download directories.")
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
-function (inference_download install_dir url filename)
-    message(STATUS "Download inference test stuff from ${url}/${filename}")
-    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
-    message(STATUS "finish downloading ${filename}")
-endfunction()
-
-function (inference_download_and_uncompress install_dir url filename)
-    inference_download(${install_dir} ${url} ${filename})
-    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
-endfunction()
 
 function(download_model_and_data install_dir model_name data_name)
     if (NOT EXISTS ${install_dir})
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 290fb007d8ba94a2d121947fe67c6474586ac0e0..c2151eea0823f80feb17b014c1f739d2a15ae862 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -52,9 +52,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-TEST(Analyzer_resnet50, profile) {
+void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg._use_mkldnn = use_mkldnn;
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -69,6 +70,11 @@ TEST(Analyzer_resnet50, profile) {
   }
 }
 
+TEST(Analyzer_resnet50, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); }
+#endif
+
 // Check the fuse status
 TEST(Analyzer_resnet50, fuse_statis) {
   AnalysisConfig cfg;
@@ -82,15 +88,21 @@ TEST(Analyzer_resnet50, fuse_statis) {
 }
 
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_resnet50, compare) {
+void compare(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg._use_mkldnn = use_mkldnn;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   CompareNativeAndAnalysis(cfg, input_slots_all);
 }
 
+TEST(Analyzer_resnet50, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index c76d72ccd99649913aefcb2aa57fe6061db8ca6d..e0416ff953b61f56a2ca1a45cb382d40a6cffa4a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -228,6 +228,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_rnn1, profile) {
   contrib::AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg.use_gpu = false;
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -308,18 +309,13 @@ TEST(Analyzer_rnn1, ZeroCopy) {
   PaddlePlace place;
   int output_size{0};
 
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
 
   config.use_feed_fetch_ops = true;
-  auto native_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto native_predictor = CreatePaddlePredictor<NativeConfig>(config);
 
   config.use_feed_fetch_ops = true;  // the analysis predictor needs feed/fetch.
-  auto analysis_predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
+  auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
 
 #define NEW_TENSOR(name__) \
   auto name__##_tensor = predictor->GetInputTensor(#name__);
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
index ba04d030b94c0924311dcff5c6a34270a764f877..e0eb919bd896d73a557001982a436fc93f087a74 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -18,12 +18,12 @@ namespace paddle {
 namespace inference {
 
 using namespace framework;  // NOLINT
+static std::vector<float> result_data;
 
 struct DataRecord {
   std::vector<std::vector<std::vector<float>>> link_step_data_all;
   std::vector<size_t> lod;
   std::vector<std::vector<float>> rnn_link_data;
-  std::vector<float> result_data;
   size_t num_samples;  // total number of samples
   size_t batch_iter{0};
   size_t batch_size{1};
@@ -57,6 +57,7 @@ struct DataRecord {
     std::ifstream file(path);
     std::string line;
     int num_lines = 0;
+    result_data.clear();
     while (std::getline(file, line)) {
       num_lines++;
       std::vector<std::string> data;
@@ -135,13 +136,12 @@ TEST(Analyzer_rnn2, profile) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
-    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
     PADDLE_ENFORCE_GT(outputs.size(), 0);
     size_t size = GetSize(outputs[0]);
     PADDLE_ENFORCE_GT(size, 0);
     float *result = static_cast<float *>(outputs[0].data.data());
     for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(result[i], data.result_data[i], 1e-3);
+      EXPECT_NEAR(result[i], result_data[i], 1e-3);
     }
   }
 }
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index cb4671c4379b5f6f144bfd5330866aa38163f4d4..f590ef27967e47ffcb3a97e80dd147efdd1906e6 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -183,7 +183,13 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
   SetConfig(&cfg);
   int num_ops;
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  GetFuseStatis(predictor.get(), &num_ops);
+
+  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  ASSERT_TRUE(fuse_statis.count("seqconv_eltadd_relu_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 2);
+  EXPECT_EQ(fuse_statis.at("seqconv_eltadd_relu_fuse"), 6);
+  EXPECT_EQ(num_ops, 32);
 }
 
 // Compare result of NativeConfig and AnalysisConfig
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 305b8bfe158150d5dfd8bdaee2c0a89afe264de4..8933296490793a7693124eba23f8cf0801881e14 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->specify_input_name = true;
   // TODO(TJ): fix fusion gru
   cfg->ir_passes.push_back("fc_gru_fuse_pass");
-#ifdef PADDLE_WITH_MKLDNN
-  cfg->_use_mkldnn = true;
-#endif
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -84,9 +81,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 
 // Easy for profiling independently.
 //  ocr, mobilenet and se_resnext50
-TEST(Analyzer_vis, profile) {
+void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg._use_mkldnn = use_mkldnn;
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -108,6 +106,12 @@ TEST(Analyzer_vis, profile) {
   }
 }
 
+TEST(Analyzer_vis, profile) { profile(); }
+
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
+#endif
+
 // Check the fuse status
 TEST(Analyzer_vis, fuse_statis) {
   AnalysisConfig cfg;
@@ -118,15 +122,21 @@ TEST(Analyzer_vis, fuse_statis) {
 }
 
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_vis, compare) {
+void compare(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg._use_mkldnn = use_mkldnn;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   CompareNativeAndAnalysis(cfg, input_slots_all);
 }
 
+TEST(Analyzer_vis, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 8603d09cbd09e4cee254faf52c2ccbe8d661994d..19c3f532d5dcb7588793fa21fa179f6b48649103 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -50,7 +50,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
     auto &ref_out = ref_outputs[i];
     size_t size = VecReduceToInt(out.shape);
     size_t ref_size = VecReduceToInt(ref_out.shape);
-    EXPECT_GT(size, 0);
+    EXPECT_GT(size, 0UL);
     EXPECT_EQ(size, ref_size);
     EXPECT_EQ(out.dtype, ref_out.dtype);
     switch (out.dtype) {
@@ -77,11 +77,9 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
     const AnalysisConfig &config, bool use_analysis = true) {
   if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig,
-                                 PaddleEngineKind::kAnalysis>(config);
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
   } else {
-    return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
-        config);
+    return CreatePaddlePredictor<NativeConfig>(config);
   }
 }
 
@@ -141,6 +139,9 @@ void TestMultiThreadPrediction(
   }
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
+#ifdef PADDLE_WITH_MKLDNN
+      platform::set_cur_thread_id(static_cast<int>(tid) + 1);
+#endif
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
@@ -165,7 +166,8 @@ void TestPrediction(const AnalysisConfig &config,
                     const std::vector<std::vector<PaddleTensor>> &inputs,
                     std::vector<PaddleTensor> *outputs, int num_threads,
                     bool use_analysis = FLAGS_use_analysis) {
-  LOG(INFO) << "use_analysis: " << use_analysis;
+  LOG(INFO) << "use_analysis: " << use_analysis
+            << ", use_mkldnn: " << config._use_mkldnn;
   if (num_threads == 1) {
     TestOneThreadPrediction(config, inputs, outputs, use_analysis);
   } else {
@@ -177,6 +179,7 @@ void TestPrediction(const AnalysisConfig &config,
 void CompareNativeAndAnalysis(
     const AnalysisConfig &config,
     const std::vector<std::vector<PaddleTensor>> &inputs) {
+  LOG(INFO) << "use_mkldnn: " << config._use_mkldnn;
   std::vector<PaddleTensor> native_outputs, analysis_outputs;
   TestOneThreadPrediction(config, inputs, &native_outputs, false);
   TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index bf320a0cbc2fff5f973c48768281e26d0fde232b..91111f2af56065bbf57ba3a41bddd55ecced1060 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -51,11 +51,8 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
   config1.model_dir = model_dirname;
   config1.max_batch_size = batch_size;
 
-  auto predictor0 =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
-  auto predictor1 =
-      CreatePaddlePredictor<MixedRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
+  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
+  auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
   // Prepare inputs
   int height = 224;
   int width = 224;
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 031109398d8e21ad95f19f65fdd814e1782889e6..919ad96f7adfc5025d9a8367c467f639c6fe3101 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -86,7 +86,7 @@ function(op_library TARGET)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
     foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
      "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
-     "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 else()
     set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
 endif()
+op_library(hash_op DEPS xxhash)
 op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
@@ -284,10 +285,10 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 if (NOT WIN32)
-op_library(lstm_op DEPS sequence2batch lstm_compute)
-op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
-op_library(lstmp_op DEPS sequence2batch lstm_compute)
-op_library(gru_op DEPS sequence2batch gru_compute)
+    op_library(lstm_op DEPS sequence2batch lstm_compute)
+    op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
+    op_library(lstmp_op DEPS sequence2batch lstm_compute)
+    op_library(gru_op DEPS sequence2batch gru_compute)
 endif(NOT WIN32)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
@@ -300,11 +301,13 @@ op_library(flatten_op DEPS reshape_op)
 op_library(sequence_pad_op DEPS sequence_padding)
 op_library(unstack_op DEPS stack_op)
 op_library(fake_quantize_op DEPS memory)
-op_library(fusion_lstm_op DEPS cpu_lstm_compute)
+op_library(crf_decoding_op DEPS jit_kernel)
+op_library(fusion_lstm_op DEPS jit_kernel)
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
     op_library(layer_norm_op DEPS cub)
     op_library(reduce_mean_op DEPS cub)
+    op_library(affine_channel_op DEPS cub)
 else()
     op_library(conv_op DEPS vol2col im2col)
 endif()
@@ -315,7 +318,7 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
-op_library(concat_op DEPS concat)
+op_library(concat_op DEPS concat_and_split)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 
@@ -347,6 +350,6 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
 if(NOT WIN32)
-nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+    nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index bbf52bea1358c32596ab6f14eeaa419735d19fc6..9ddb3a5d29f973047507855b43b226913a3600b5 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -28,7 +28,7 @@ using paddle::framework::Tensor;
    public:                                                              \
     void Make() override {                                              \
       AddInput("X", "Input of " #OP_NAME " operator");                  \
-      AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X");   \
+      AddOutput("Out", "Output of " #OP_NAME " operator");              \
       AddAttr<bool>("use_mkldnn",                                       \
                     "(bool, default false) Only used in mkldnn kernel") \
           .SetDefault(false);                                           \
diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc
index 5d670fe3b9d99a31a628ff707ff860564eca952e..f3717af630017eba18aa265f3dbb496e18280a57 100644
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
@@ -92,9 +92,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
     AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
 
-    AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param");
-    AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1");
-    AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2");
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("Moment1Out", "(Tensor) Output first moment");
+    AddOutput("Moment2Out", "(Tensor) Output second moment");
 
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
index 3e724f52e4fcb2818f961181a2830dfd653d2733..3455d1ee54e8e6e498d0b0e6932ec099af9c0b30 100644
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/algorithm.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/for_range.h"
 
@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
         row_numel_(row_numel),
         row_count_(row_count) {}
 
-  inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
-    int64_t beg = 0, end = row_count_ - 1;
-    while (beg <= end) {
-      auto mid = ((beg + end) >> 1);
-      if (rows_[mid] == row)
-        return mid;
-      else if (rows_[mid] < row)
-        beg = mid + 1;
-      else
-        end = mid - 1;
-    }
-    return -1;
-  }
-
   inline HOSTDEVICE void operator()(size_t i) const {
-    int64_t row = i / row_numel_;
-    auto row_idx = BinarySearchInRows(row);
+    auto row_idx =
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
     T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
 
     // The following code is the same as dense
diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8127e554bed1aae7a5ce8837bcadf1b7f13f1ac2
--- /dev/null
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
@@ -0,0 +1,97 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/add_position_encoding_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AddPositionEncodingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of add_position_encoding_op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Out(Output) of add_position_encoding_op should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Out@GRAD must not be null.");
+
+    auto out_dims = ctx->GetInputDim("Out");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
+    }
+  }
+};
+
+class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of AddPositionEncoding operator");
+    AddOutput("Out", "Output of AddPositionEncoding operator");
+    AddAttr<float>("alpha", "The scale of Original Embedding.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& alpha) {
+          PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
+        });
+    AddAttr<float>("beta", "The scale of Position Embedding.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& beta) {
+          PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
+        });
+    AddComment(R"DOC(
+    Add Position Encoding Operator.
+    
+    The add position encoding calculates the output based on the input, alpha, beta.
+    The size of each dimension of the parameters checked in the infer-shape.
+  )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plt = paddle::platform;
+
+REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
+                  ops::AddPositionEncodingOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    add_position_encoding,
+    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
+    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    add_position_encoding_grad,
+    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
+    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f371235f160c416058e877dbba2d9fe89abf7db
--- /dev/null
+++ b/paddle/fluid/operators/add_position_encoding_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class AddPositionEncodingKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::LoDTensor>("X");
+    auto& x_lod = X->lod();
+    auto* src_ptr = X->data<T>();
+
+    auto* Out = context.Output<framework::LoDTensor>("Out");
+    auto* dst_ptr = Out->mutable_data<T>(context.GetPlace());
+
+    float alpha = context.Attr<float>("alpha");
+    float beta = context.Attr<float>("beta");
+
+    auto x_dim = X->dims();
+    int batch_size = 0;
+    int max_seq_len = 0;
+    int enc_size = 0;
+
+    if (x_lod.empty()) {
+      PADDLE_ENFORCE(
+          x_dim.size() == 3UL,
+          "The input X of Add Position Encoding should be 3-D Tensor!");
+      batch_size = x_dim[0];
+      max_seq_len = x_dim[1];
+      enc_size = x_dim[2];
+    } else {
+      PADDLE_ENFORCE(
+          x_dim.size() == 2UL,
+          "The input X of Add Position Encoding should be 2-D LoDTensor!");
+      PADDLE_ENFORCE(
+          x_lod.size() == 1UL,
+          "The Add Position Encoding Op only supports lod_level == 1!");
+      batch_size = x_lod[0].size() - 1;
+      max_seq_len = -1;
+      enc_size = x_dim[1];
+    }
+
+    PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!");
+
+    const int half_size = enc_size / 2;
+    for (int i = 0; i < batch_size; ++i) {
+      const int max_length =
+          x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
+      for (int j = 0; j < max_length; ++j) {
+        for (int k = 0; k < half_size; ++k) {
+          const double val = (half_size > 1)
+                                 ? j / pow(10000.0, double(k) / (half_size - 1))
+                                 : j / 10000.0;
+          dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
+          dst_ptr[half_size + k] =
+              src_ptr[half_size + k] * alpha + cos(val) * beta;
+        }
+        src_ptr += enc_size;
+        dst_ptr += enc_size;
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AddPositionEncodingGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dOut =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+
+    auto* dX =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+
+    float alpha = context.Attr<float>("alpha");
+
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    dx.device(*place) = dout * static_cast<T>(alpha);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8944a749674c3ba6c83526e4d66f449075716f43
--- /dev/null
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -0,0 +1,255 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class AffineChannelOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) Feature map input can be a 4D tensor with order NCHW "
+             "or NHWC. It also can be a 2D tensor and C is the second "
+             "dimension.");
+    AddInput("Scale",
+             "(Tensor) 1D input of shape (C), the c-th element "
+             "is the scale factor of the affine transformation "
+             "for the c-th channel of the input.");
+    AddInput("Bias",
+             "(Tensor) 1D input of shape (C), the c-th element "
+             "is the bias of the affine transformation for the "
+             "c-th channel of the input.");
+    AddAttr<std::string>(
+        "data_layout",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
+    AddOutput("Out", "(Tensor) A tensor of the same shape and order with X.");
+    AddComment(R"DOC(
+
+Applies a separate affine transformation to each channel of the input. Useful
+for replacing spatial batch norm with its equivalent fixed transformation.
+The input also can be 2D tensor and applies a affine transformation in second
+dimension.
+
+$$Out = Scale*X + Bias$$
+
+)DOC");
+  }
+};
+
+class AffineChannelOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of AffineChannelOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class AffineChannelOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                     "Input(Scale) should not be null.");
+      ctx->SetOutputDim(framework::GradVarName("X"),
+                        ctx->GetInputDim(framework::GradVarName("Out")));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      // Scale@GRAD and Bias@GRAD must exist at the same time.
+      PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                     "Output(Scale@GRAD) should not be null.");
+      PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Scale"));
+    }
+  }
+};
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
+template <typename DeviceContext, typename T>
+class AffineChannelKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+
+    auto* y = ctx.Output<framework::Tensor>("Out");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+
+    auto dims = x->dims();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = x->numel() / N / C;
+
+    auto* scale_d = scale->data<T>();
+    auto* bias_d = bias->data<T>();
+    ConstEigenVectorArrayMap<T> a_e(scale_d, C);
+    ConstEigenVectorArrayMap<T> b_e(bias_d, C);
+
+    auto* x_d = x->data<T>();
+    auto* y_d = y->data<T>();
+    if (layout == framework::DataLayout::kNCHW) {
+      int stride = C * HxW;
+      for (int i = 0; i < N; i++) {
+        ConstEigenArrayMap<T> x_e(x_d, HxW, C);
+        EigenArrayMap<T> y_e(y_d, HxW, C);
+        y_e = (x_e.rowwise() * a_e.transpose()).rowwise() + b_e.transpose();
+        x_d += stride;
+        y_d += stride;
+      }
+    } else {
+      int num = N * HxW;
+      ConstEigenArrayMap<T> x_e(x_d, C, num);
+      EigenArrayMap<T> y_e(y_d, C, num);
+      y_e = (x_e.colwise() * a_e).colwise() + b_e;
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AffineChannelGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dscale =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Scale"));
+    auto* dbias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
+
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+
+    auto dims = x->dims();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = x->numel() / N / C;
+
+    auto* x_d = x->data<T>();
+    auto* dy_d = dy->data<T>();
+    auto* scale_d = scale->data<T>();
+    ConstEigenVectorArrayMap<T> scale_e(scale_d, C);
+
+    T* dx_d = dx ? dx->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* dscale_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* dbias_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    EigenVectorArrayMap<T> dscale_e(dscale_d, C);
+    EigenVectorArrayMap<T> dbias_e(dbias_d, C);
+
+    if (layout == framework::DataLayout::kNCHW) {
+      // compute dx
+      int stride = C * HxW;
+      if (dx) {
+        for (int i = 0; i < N; i++) {
+          ConstEigenArrayMap<T> dy_e(dy_d, HxW, C);
+          EigenArrayMap<T> dx_e(dx_d, HxW, C);
+          dx_e = dy_e.rowwise() * scale_e.transpose();
+          dy_d += stride;
+          dx_d += stride;
+        }
+      }
+      // compute dscale and dbias
+      if (dscale && dbias) {
+        dy_d = dy->data<T>();
+        for (int i = 0; i < N; i++) {
+          ConstEigenArrayMap<T> x_e(x_d, HxW, C);
+          ConstEigenArrayMap<T> dy_e(dy_d, HxW, C);
+          if (i == 0) {
+            dscale_e = (x_e * dy_e).colwise().sum();
+          } else {
+            dscale_e += (x_e * dy_e).colwise().sum();
+          }
+          if (i == 0) {
+            dbias_e = dy_e.colwise().sum();
+          } else {
+            dbias_e += dy_e.colwise().sum();
+          }
+          x_d += stride;
+          dy_d += stride;
+        }
+      }
+    } else {
+      int num = N * HxW;
+      ConstEigenArrayMap<T> dy_e(dy_d, C, num);
+      // compute dx
+      if (dx) {
+        EigenArrayMap<T> dx_e(dx_d, C, num);
+        dx_e = dy_e.colwise() * scale_e;
+      }
+      // compute dscale and dbias
+      if (dscale && dbias) {
+        ConstEigenArrayMap<T> x_e(x_d, C, num);
+        dscale_e = (x_e * dy_e).rowwise().sum();
+        dbias_e = dy_e.rowwise().sum();
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
+                  ops::AffineChannelOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);
+
+REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,
+                       ops::AffineChannelKernel<CPU, double>);
+REGISTER_OP_CPU_KERNEL(affine_channel_grad,
+                       ops::AffineChannelGradKernel<CPU, float>,
+                       ops::AffineChannelGradKernel<CPU, double>);
diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2bebdb345ab324eb0a2dafd54c74833dd21bdb6d
--- /dev/null
+++ b/paddle/fluid/operators/affine_channel_op.cu
@@ -0,0 +1,187 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "cub/cub.cuh"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, framework::DataLayout layout, bool HasBias>
+__global__ void KeAffineChannelCUDA(const T* x, const T* scale, const T* bias,
+                                    const int C, const int HxW, const int num,
+                                    T* y) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
+    if (HasBias) {
+      y[i] = scale[c] * x[i] + bias[c];
+    } else {
+      y[i] = scale[c] * x[i];
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class AffineChannelCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+
+    auto* y = ctx.Output<framework::Tensor>("Out");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    auto dims = x->dims();
+    const int num = x->numel();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = num / N / C;
+
+    const T* x_d = x->data<T>();
+    const T* scale_d = scale->data<T>();
+    const T* bias_d = bias->data<T>();
+    T* y_d = y->data<T>();
+
+    int block = 1024;
+    int grid = (num + block - 1) / block;
+    if (layout == framework::DataLayout::kNCHW) {
+      KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                          true><<<grid, block, 0, dev_ctx.stream()>>>(
+          x_d, scale_d, bias_d, C, HxW, num, y_d);
+    } else {
+      KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
+                          true><<<grid, block, 0, dev_ctx.stream()>>>(
+          x_d, scale_d, bias_d, C, HxW, num, y_d);
+    }
+  }
+};
+
+template <typename T, int BlockDim, framework::DataLayout layout>
+__global__ void AffineChannelScaleBiasGradientCUDAKernel(
+    const T* dy, const T* x, const int N, const int C, const int HxW, T* dscale,
+    T* dbias) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ds_storage;
+  __shared__ typename BlockReduce::TempStorage db_storage;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    T ds_sum = 0;
+    T db_sum = 0;
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == framework::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      ds_sum += dy[index] * x[index];
+      db_sum += dy[index];
+    }
+    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
+    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[i] = ds_sum;
+      dbias[i] = db_sum;
+    }
+    __syncthreads();
+  }
+}
+
+template <typename DeviceContext, typename T>
+class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dscale =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Scale"));
+    auto* dbias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
+
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    auto dims = x->dims();
+    const int num = x->numel();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = num / N / C;
+
+    const T* x_d = x->data<T>();
+    const T* dy_d = dy->data<T>();
+    const T* s_d = scale->data<T>();
+
+    T* dx_d = dx ? dx->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* ds_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* db_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
+
+    const int block = 1024;
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int max_blocks = std::max(max_threads / block, 1);
+    int grid1 = (num + block - 1) / block;
+    int grid2 = std::min(C, max_blocks);
+    if (layout == framework::DataLayout::kNCHW) {
+      if (dx) {
+        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
+            dy_d, s_d, nullptr, C, HxW, num, dx_d);
+      }
+      if (dscale && dbias) {
+        AffineChannelScaleBiasGradientCUDAKernel<
+            T, block, framework::DataLayout::kNCHW><<<grid2, block, 0,
+                                                      dev_ctx.stream()>>>(
+            dy_d, x_d, N, C, HxW, ds_d, db_d);
+      }
+    } else {
+      if (dx) {
+        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
+            dy_d, s_d, nullptr, C, HxW, num, dx_d);
+      }
+      if (dscale && dbias) {
+        AffineChannelScaleBiasGradientCUDAKernel<
+            T, block, framework::DataLayout::kNHWC><<<grid2, block, 0,
+                                                      dev_ctx.stream()>>>(
+            dy_d, x_d, N, C, HxW, ds_d, db_d);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+
+REGISTER_OP_CUDA_KERNEL(affine_channel,
+                        ops::AffineChannelCUDAKernel<CUDA, float>,
+                        ops::AffineChannelCUDAKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(affine_channel_grad,
+                        ops::AffineChannelGradCUDAKernel<CUDA, float>,
+                        ops::AffineChannelGradCUDAKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ed71594ba5781590f3291d56c4ba1a4443003bd5
--- /dev/null
+++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using ScopedSpatialTransformerDescriptor =
+    platform::ScopedSpatialTransformerDescriptor;
+
+template <typename T>
+class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto* theta = ctx.Input<Tensor>("Theta");
+    auto* output = ctx.Output<Tensor>("Output");
+    const T* theta_data = theta->data<T>();
+
+    int n = theta->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    Tensor h_sizes;
+    int* h_size_data;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      h_size_data = h_sizes.data<int>();
+    } else {
+      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
+      h_size_data[0] = n;
+      h_size_data[1] = size_attr[1];
+      h_size_data[2] = size_attr[2];
+      h_size_data[3] = size_attr[3];
+    }
+
+    T* output_data = output->mutable_data<T>(
+        {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace());
+    ScopedSpatialTransformerDescriptor st_desc;
+    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
+        st_desc.descriptor<T>(4, h_size_data);
+
+    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
+        handle, cudnn_st_desc, theta_data, output_data));
+  }
+};
+
+template <typename T>
+class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
+
+    int n = output_grad->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    Tensor h_sizes;
+    int* h_size_data;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      h_size_data = h_sizes.data<int>();
+    } else {
+      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
+      h_size_data[0] = n;
+      h_size_data[1] = size_attr[1];
+      h_size_data[2] = size_attr[2];
+      h_size_data[3] = size_attr[3];
+    }
+
+    ScopedSpatialTransformerDescriptor st_desc;
+    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
+        st_desc.descriptor<T>(4, h_size_data);
+
+    const T* output_grad_data = output_grad->data<T>();
+    T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());
+
+    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
+        handle, cudnn_st_desc, output_grad_data, theta_grad_data));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNAffineGridOpKernel<float>,
+                   paddle::operators::CUDNNAffineGridOpKernel<double>);
+REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNAffineGridGradOpKernel<float>,
+                   paddle::operators::CUDNNAffineGridGradOpKernel<double>);
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ea28265a245c9cd1a35a79324a33f7cf208a159
--- /dev/null
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -0,0 +1,233 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/affine_grid_op.h"
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+struct Linspace<paddle::platform::CPUDeviceContext, T> {
+  framework::Tensor operator()(T start, T end, int count,
+                               const framework::ExecutionContext& ctx) {
+    Tensor numbers;
+    T* number_data = numbers.mutable_data<T>({count}, platform::CPUPlace());
+    T slice = (end - start) / (T)(count - 1);
+    for (int i = 0; i < count; ++i) {
+      number_data[i] = start + (T)i * slice;
+    }
+    return numbers;
+  }
+};
+
+class AffineGridOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Theta"),
+                   "Input(Theta) of AffineGridOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output(Output) of AffineGridOp should not be null.");
+    auto theta_dims = ctx->GetInputDim("Theta");
+    PADDLE_ENFORCE(theta_dims.size() == 3,
+                   "AffineGrid's Input(Theta) should be 3-D tensor.");
+
+    auto output_shape = ctx->Attrs().Get<std::vector<int>>("output_shape");
+    if (output_shape.size() == 0) {
+      PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
+                     "Input(OutputShape) of AffineGridOp should not be null if "
+                     "attr(output_shape) is not configured.");
+      auto output_shape_dims = ctx->GetInputDim("OutputShape");
+      PADDLE_ENFORCE(output_shape_dims.size() == 1,
+                     "AffineGrid's Input(OutputShape) should be 1-D tensor.");
+    } else {
+      PADDLE_ENFORCE(output_shape.size() == 4,
+                     "The size of attr(output_shape) should be 4.");
+    }
+
+    PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2.");
+    PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3.");
+    // N * H * W * 2
+    ctx->SetOutputDim("Output",
+                      framework::make_ddim({theta_dims[0], -1, -1, 2}));
+    ctx->ShareLoD("Theta", "Output");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kCUDNN;
+    }
+#endif
+    auto data_type = framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
+    return framework::OpKernelType(data_type, ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, library);
+  }
+};
+
+class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Theta",
+        "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
+        "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
+        "y_1).");
+    AddInput("OutputShape",
+             "(Tensor) The shape of target image with format [N, C, H, W].")
+        .AsDispensable();
+    AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
+    AddAttr<bool>(
+        "use_cudnn",
+        "(bool, default false) Only used in cudnn kernel, need install cudnn")
+        .SetDefault(true);
+    AddAttr<std::vector<int>>(
+        "output_shape",
+        "The target output image shape with format [N, C, H, W].")
+        .SetDefault(std::vector<int>());
+
+    AddComment(R"DOC(
+    It generates a grid of (x,y) coordinates using the parameters of the
+    affine transformation that correspond to a set of points where the input
+    feature map should be sampled to produce the transformed output feature map.
+
+    Given:
+        Theta = [[[x_11, x_12, x_13]
+                  [x_14, x_15, x_16]]
+                 [[x_21, x_22, x_23]
+                  [x_24, x_25, x_26]]]
+    
+        OutputShape = [2, 3, 5, 5]
+
+    Step 1:
+
+        Generate relative coordinates according to OutputShape.
+        The values of relative coordinates are in the interval between -1 and 1.
+        The shape of the relative coordinates is [2, H, W] as below:
+    
+        C = [[[-1.  -1.  -1.  -1.  -1. ]
+              [-0.5 -0.5 -0.5 -0.5 -0.5]
+              [ 0.   0.   0.   0.   0. ]
+              [ 0.5  0.5  0.5  0.5  0.5]
+              [ 1.   1.   1.   1.   1. ]] 
+             [[-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]]]
+        C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
+    
+    Step2:
+        Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
+        C_ = [[-1.  -1.   1. ]
+              [-0.5 -1.   1. ]
+              [ 0.  -1.   1. ]
+              [ 0.5 -1.   1. ]
+              [ 1.  -1.   1. ]
+              [-1.  -0.5  1. ]
+              [-0.5 -0.5  1. ]
+              [ 0.  -0.5  1. ]
+              [ 0.5 -0.5  1. ]
+              [ 1.  -0.5  1. ]
+              [-1.   0.   1. ]
+              [-0.5  0.   1. ]
+              [ 0.   0.   1. ]
+              [ 0.5  0.   1. ]
+              [ 1.   0.   1. ]
+              [-1.   0.5  1. ]
+              [-0.5  0.5  1. ]
+              [ 0.   0.5  1. ]
+              [ 0.5  0.5  1. ]
+              [ 1.   0.5  1. ]
+              [-1.   1.   1. ]
+              [-0.5  1.   1. ]
+              [ 0.   1.   1. ]
+              [ 0.5  1.   1. ]
+              [ 1.   1.   1. ]]
+    Step3:
+        Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
+    )DOC");
+  }
+};
+
+class AffineGridOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto theta_dims = ctx->GetInputDim("Theta");
+    if (ctx->HasOutput(framework::GradVarName("Theta"))) {
+      ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
+#endif
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Theta")->type()),
+        ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_);
+  }
+};
+
+class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("affine_grid_grad");
+    op->SetInput("Theta", Input("Theta"));
+    op->SetInput("OutputShape", Input("OutputShape"));
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
+                  ops::AffineGridGradMaker);
+REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    affine_grid,
+    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    affine_grid_grad,
+    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..07e26c292c3bafc4d98bd392a9e1e21a9eb383a8
--- /dev/null
+++ b/paddle/fluid/operators/affine_grid_op.h
@@ -0,0 +1,190 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+using Array1 = Eigen::DSizes<int64_t, 1>;
+using Array2 = Eigen::DSizes<int64_t, 2>;
+using Array3 = Eigen::DSizes<int64_t, 3>;
+using Array4 = Eigen::DSizes<int64_t, 4>;
+
+/**
+ *Return a tensor with evenly spaced numbers over a specified interval.
+ */
+template <typename DeviceContext, typename T>
+struct Linspace {
+  framework::Tensor operator()(T start, T end, int count,
+                               const framework::ExecutionContext& ctx);
+};
+
+template <typename DeviceContext, typename T>
+class AffineGridOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto* theta = ctx.Input<Tensor>("Theta");
+    int n = theta->dims()[0];
+
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    int h = 0;
+    int w = 0;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+
+    auto* output = ctx.Output<Tensor>("Output");
+    output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), output,
+        static_cast<T>(0));
+
+    Linspace<DeviceContext, T> linspace;
+    // Get indexes of height with shape [height, width, 1]
+    auto h_idx = linspace((T)-1, (T)1, h, ctx);
+    auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
+    // Get indexes of width with shape [height, width, 1]
+    auto w_idx = linspace((T)-1, (T)1, w, ctx);
+    auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
+    // Get constant ones tensor with shape [height, width, 1]
+    Tensor ones;
+    ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
+    auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
+    // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
+    // ones
+    Tensor grid;
+    grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
+    auto grid_t = EigenTensor<T, 4>::From(grid);
+
+    grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
+                               .broadcast(Array2(h, 1))
+                               .reshape(Array3(h, w, 1))
+                               .concatenate(h_idx_t.reshape(Array2(1, h))
+                                                .broadcast(Array2(w, 1))
+                                                .shuffle(Array2(1, 0))
+                                                .reshape(Array3(h, w, 1)),
+                                            2)
+                               .eval()
+                               .concatenate(ones_t, 2)
+                               .reshape(Array4(1, h, w, 3))
+                               .broadcast(Array4(n, 1, 1, 1));
+
+    // output = grid * theta.T
+    // TODO(wanghaoshuang): Refine batched matrix multiply
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    for (int i = 0; i < n; ++i) {
+      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
+      Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
+      Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2});
+      blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
+                  T(0));
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AffineGridGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
+
+    int n = output_grad->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    int h = 0;
+    int w = 0;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+
+    theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
+
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), theta_grad,
+        static_cast<T>(0));
+
+    Linspace<DeviceContext, T> linspace;
+
+    // Get indexes of height with shape [height, width, 1]
+    auto h_idx = linspace((T)-1, (T)1, h, ctx);
+    auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
+    // Get indexes of width with shape [height, width, 1]
+    auto w_idx = linspace((T)-1, (T)1, w, ctx);
+    auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
+    // Get constant ones tensor with shape [height, width, 1]
+    Tensor ones;
+    ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
+    auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
+    // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
+    // ones
+    Tensor grid;
+    grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
+    auto grid_t = EigenTensor<T, 4>::From(grid);
+    grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
+                               .broadcast(Array2(h, 1))
+                               .reshape(Array3(h, w, 1))
+                               .concatenate(h_idx_t.reshape(Array2(1, h))
+                                                .broadcast(Array2(w, 1))
+                                                .shuffle(Array2(1, 0))
+                                                .reshape(Array3(h, w, 1)),
+                                            2)
+                               .eval()
+                               .concatenate(ones_t, 2)
+                               .reshape(Array4(1, h, w, 3))
+                               .broadcast(Array4(n, 1, 1, 1));
+    // output = grid * theta.T
+    // TODO(wanghaoshuang): Refine batched matrix multiply
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    for (int i = 0; i < n; ++i) {
+      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
+      Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2});
+      Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
+      blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
+                  &sliced_theta_grad, T(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index b8b8b2290a0f002fd379032e28590b84a1da38e9..6257e04b010d8c580e69e466759e8e80d344c105 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <paddle/fluid/operators/math/concat.h>
+#include <paddle/fluid/operators/math/concat_and_split.h>
 #include <numeric>
 
 #include "paddle/fluid/framework/lod_rank_table.h"
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 5912a1a17cbd29c3ebd83f37133c044f0905c8bd..3eb473832577bd348b33ba9b0be9e597b78f26bc 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -135,15 +135,13 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Variance",
              "The global variance (for training) "
              "or estimated Variance (for testing)");
-    AddOutput("Y", "result after normalization").Reuse("X");
+    AddOutput("Y", "result after normalization");
     AddOutput("MeanOut",
               "Share memory with Mean. "
-              "Store the global mean when training")
-        .Reuse("Mean");
+              "Store the global mean when training");
     AddOutput("VarianceOut",
               "Share memory with Variance. "
-              "Store the global Variance when training")
-        .Reuse("Variance");
+              "Store the global Variance when training");
     AddOutput("SavedMean",
               "Mean of the current mini batch, "
               "will apply to output when training")
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index b6cb935814e25b31d4104f9ce24fe952680cb491..0d32cae0e1e5ff274793df50e854283d8e2f7bf8 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -79,6 +79,9 @@ struct BeamSearchDecodeFunctor {
   bool tensor_on_gpu_;
   size_t beam_size_;
   int end_id_;
+  // TODO(Superjomn) Here might result serious performance issue in the
+  // concurrency
+  // scenarios.
   const LoDTensorArray& step_ids_origin_;
   const LoDTensorArray& step_scores_origin_;
   LoDTensorArray step_ids_ = LoDTensorArray();
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index b2c6495c442cd02679825425becc2160c303dcc6..bd474be0facb349c53a8766412311296383a86c5 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
@@ -89,29 +89,17 @@ class ConcatGradKernel : public framework::OpKernel<T> {
         outputs.push_back(nullptr);
       }
     }
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
     // Sometimes direct copies will be faster, this maybe need deeply analysis.
     if (axis == 0 && outs.size() < 10) {
-      size_t input_offset = 0;
-      const auto in_stride = framework::stride_numel(out_grad->dims());
-
-      for (size_t i = 0; i < outs.size(); ++i) {
-        auto out_stride = framework::stride_numel(ins[i]->dims());
-        auto* out = outputs[i];
-        if (out != nullptr) {
-          StridedNumelCopyWithAxis<T>(
-              ctx.device_context(), axis, out->data<T>(), out_stride,
-              out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
-        }
-        input_offset += out_stride[axis];
-      }
+      std::vector<const framework::Tensor*> ref_shape;
+      ref_shape.insert(ref_shape.begin(), ins.begin(), ins.end());
+      StridedMemcpyWithAxis0<T>(dev_ctx, *out_grad, ref_shape, &outputs);
     } else {
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
-          concat_grad_functor;
-      concat_grad_functor(dev_ctx, *out_grad,
-                          ctx.MultiInput<framework::Tensor>("X"),
-                          static_cast<int>(axis), &outputs);
+      math::SplitFunctor<DeviceContext, T> split_functor;
+      split_functor(dev_ctx, *out_grad, ctx.MultiInput<framework::Tensor>("X"),
+                    static_cast<int>(axis), &outputs);
     }
   }
 };
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index eae65968285703f5882d910e29bc5d8e1511cba6..521f423fb022098e6930c333af6b5e54c502cb7e 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     bool fuse_relu = ctx.Attr<bool>("fuse_relu");
-    bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise");
+    bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
     int groups = ctx.Attr<int>("groups");
 
-    // TODO: add support for dilation
+    // TODO(tpatejko): add support for dilation
     PADDLE_ENFORCE(
         dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
         "dilation in convolution is not implemented yet");
@@ -369,11 +369,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
       conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
                                      strides, paddings, mkldnn_engine,
-                                     fuse_relu, fuse_eltwise);
+                                     fuse_relu, fuse_residual_conn);
     } else {
       conv_pd =
           ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                               mkldnn_engine, fuse_relu, fuse_eltwise);
+                               mkldnn_engine, fuse_relu, fuse_residual_conn);
     }
     // Save conv_pd/src_memory/weights_memory for backward pass
     dev_ctx.SetBlob(key_conv_pd, conv_pd);
@@ -386,8 +386,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto user_weights_memory_p = handler.AcquireWeightsMemory(
         user_weights_md, to_void_cast<T>(filter_data));
 
-    T* output_data =
-        output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
+    T* output_data = nullptr;
+
+    if (fuse_residual_conn) {
+      auto residual_param = ctx.Input<Tensor>("ResidualData");
+      auto residual_param_data = residual_param->data<T>();
+
+      PADDLE_ENFORCE(
+          residual_param_data != nullptr,
+          "Provide data if you want MKLDNN conv+elementwise_add fusion");
+      PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
+                        "Output and elementwise parameter need to have the "
+                        "same dimension sizes");
+
+      output->ShareDataWith(*residual_param);
+      output_data = output->mutable_data<T>(ctx.GetPlace());
+    } else {
+      output_data =
+          output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
+    }
+
     // create reorder primitive if the input format is not the preferred one
     auto src_memory_p =
         handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
@@ -424,14 +442,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
  private:
   mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
-                                       bool fuse_eltwise) const {
+                                       bool fuse_residual_conn) const {
     mkldnn::primitive_attr conv_attr;
     mkldnn::post_ops post_operations;
     // Fusion with Elementwise layer relies on adding a sum post-operation with
-    // the scale parameter. It is assumed that when fuse_eltwise is true, the
-    // Output tensor contains the data coming from residual connection. The
-    // result of this post_op is: Output = scale * Output + Conv_Out.
-    if (fuse_eltwise) {
+    // the scale parameter. It is assumed that when fuse_residual_connection is
+    // true, the output tensor contains the data coming from residual
+    // connection. The result of this post_op is:
+    // Output = scale * Output + Conv_Out.
+    if (fuse_residual_conn) {
       post_operations.append_sum(1.0f);
     }
     // Fusion with ReLU layer is executed through the PostOps feature. Create a
@@ -452,7 +471,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const memory::desc& dst, const std::vector<int>& strides,
                        const std::vector<int>& paddings,
                        const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_eltwise) const {
+                       const bool fuse_residual_conn) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -461,7 +480,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         dst, stride_dims, padding_dims, padding_dims,
         mkldnn::padding_kind::zero);
 
-    mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
+    mkldnn::primitive_attr conv_attr =
+        CreatePostOps(fuse_relu, fuse_residual_conn);
 
     auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
         conv_desc, conv_attr, engine);
@@ -476,7 +496,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
                        const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_eltwise) const {
+                       const bool fuse_residual_conn) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -485,7 +505,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         bias, dst, stride_dims, padding_dims, padding_dims,
         mkldnn::padding_kind::zero);
 
-    mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
+    mkldnn::primitive_attr conv_attr =
+        CreatePostOps(fuse_relu, fuse_residual_conn);
 
     auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
         conv_desc, conv_attr, engine);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 8f84bf71a7f77606bed6672f0830e3fc80165a42..2cd9979bd3426a15af34a49002d5db2fdd9aeec7 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -130,8 +130,12 @@ void Conv2DOpMaker::Make() {
       .AsDispensable();
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator. "
-            "The format of output tensor is also NCHW.")
-      .Reuse("Input");
+            "The format of output tensor is also NCHW.");
+  AddInput("ResidualData",
+           "(Tensor) Tensor with residual data "
+           "to which convolution output will be added."
+           "Used with fuse_residual_connection fusion.")
+      .AsDispensable();
   AddAttr<std::vector<int>>("strides",
                             "(vector<int> default:{1, 1}), the "
                             "strides(h_stride, w_stride) of "
@@ -164,10 +168,10 @@ void Conv2DOpMaker::Make() {
       .SetDefault(false);
   AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
-  AddAttr<bool>("fuse_eltwise",
+  AddAttr<bool>("fuse_residual_connection",
                 "(bool, default false) Only used in mkldnn kernel. Used "
-                "whenever convolution output is connected via skip connection "
-                "to a previous layer.")
+                "whenever convolution output is as an input to residual "
+                "connection.")
       .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
@@ -233,8 +237,7 @@ void Conv3DOpMaker::Make() {
            "input image channels divided by the groups.");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
-            "The format of output tensor is also NCDHW.")
-      .Reuse("Input");
+            "The format of output tensor is also NCDHW.");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int>, default:{1, 1, 1}), the "
                             "strides(d_stride, h_stride, w_stride) of "
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index 8181897c3d3844bda5574e85a08b2af038fcd664..e9d2e84a434d7084c526a6e75363a65577197262 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -69,9 +70,6 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     auto emission_dims = emission_weights.dims();
     const size_t seq_len = emission_dims[0];
     const size_t tag_num = emission_dims[1];
-
-    const size_t state_trans_base_idx = 2;
-
     const T* x = emission_weights.data<T>();
     const T* w = transition_weights.data<T>();
     int64_t* path = decoded_path->data<int64_t>();
@@ -84,221 +82,10 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     Tensor track;
     int* track_value =
         track.mutable_data<int>(emission_dims, platform::CPUPlace());
-
-#ifdef __AVX__
-// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or
-// 16 elements per iteration. Then it can implement the parallel processing.
-// Only optimize for float type.
-#ifdef __AVX512F__
-    size_t step_size = 16;
-#else
-    size_t step_size = 8;
-#endif
-    if (std::is_same<T, float>::value && (tag_num >= step_size)) {
-      size_t steps = tag_num / step_size;
-      size_t remain = tag_num % step_size;
-      int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
-
-      // Setup the alpha initial value.
-      size_t i_offset = 0;
-      for (size_t i = 0; i <= steps; ++i) {
-#ifdef __AVX512F__
-        // Declare the variable for the content of weights, input and alpha
-        // values.
-        __m512 w_content, x_content, alpha_content;
-
-        // Load the relevant data into the variables from un-aligned address.
-        w_content = _mm512_loadu_ps((const float*)(w + i_offset));
-        x_content = _mm512_loadu_ps((const float*)(x + i_offset));
-        alpha_content = _mm512_add_ps(w_content, x_content);
-
-        // Save the alpha value.
-        _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
-                         alpha_content);
-#else
-        // Declare the variable for the content of weights, input and alpha
-        // values.
-        __m256 w_content, x_content, alpha_content;
-
-        // Load the relevant data into the variables from un-aligned address.
-        w_content = _mm256_loadu_ps((const float*)(w + i_offset));
-        x_content = _mm256_loadu_ps((const float*)(x + i_offset));
-        alpha_content = _mm256_add_ps(w_content, x_content);
-
-        // Save the alpha value.
-        _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
-                         alpha_content);
-#endif
-        i_offset += step_size;
-        if (i == steps - 1) {
-          if (remain > 0) {
-            i_offset += last_offset;
-          } else {
-            break;
-          }
-        }
-      }
-
-      // Use the column-major strategy to get the location of maximum score.
-      size_t seq_offset = 0;
-      for (size_t k = 1; k < seq_len; ++k) {
-        size_t j_offset = 0;
-        for (size_t j = 0; j <= steps; ++j) {
-#ifdef __AVX512F__
-          // Initialize the variables of maximum score and location.
-          __m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
-          __m512i max_j = _mm512_setzero_si512();
-#else
-          // Initialize the variables of maximum score and location.
-          __m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
-          __m256i max_j = _mm256_set1_epi32(0);
-#endif
-          // Calculate the offset of transition_weights.
-          size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
-          for (size_t i = 0; i < tag_num; ++i) {
-#ifdef __AVX512F__
-            // Initalize the content of alpha variable with related offset.
-            __m512 alpha_content =
-                _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
-            // Obtain the content of weights from un-aligned address.
-            __m512 w_content =
-                _mm512_loadu_ps((const float*)(w + trans_offset));
-
-            __m512 score_v = _mm512_add_ps(alpha_content, w_content);
-
-            __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
-
-            // According to the mask value, it update the index of the max_score
-            // location.
-            max_j = _mm512_mask_set1_epi32(max_j, mask, i);
-
-            // Update the max_score value.
-            max_score = _mm512_max_ps(max_score, score_v);
-#else
-            // Initalize the content of alpha variable with related offset.
-            __m256 alpha_content = _mm256_broadcast_ss(
-                (const float*)(alpha_value + seq_offset + i));
-            // Obtain the content of weights from un-aligned address.
-            __m256 w_content =
-                _mm256_loadu_ps((const float*)(w + trans_offset));
-            __m256 score_v = _mm256_add_ps(alpha_content, w_content);
-
-            __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
-
-#ifdef __AVX2__
-            // According to the mask value, it update the index of the max_score
-            // location.
-            max_j = _mm256_or_si256(
-                _mm256_andnot_si256((__m256i)mask, max_j),
-                _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
-#else
-            __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
-            __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
-            __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
-            __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
-
-            lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
-            hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
-            lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
-            hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
-
-            lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
-            hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
-
-            // According to the mask value, it update the index of the max_score
-            // location.
-            max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
-            max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
-#endif
-
-            // Update the max_score value.
-            max_score = _mm256_max_ps(max_score, score_v);
-#endif
-            trans_offset += tag_num;
-          }
-
-#ifdef __AVX512F__
-          // Update the alpha and track values.
-          __m512 x_content = _mm512_loadu_ps(
-              (const float*)(x + seq_offset + tag_num + j_offset));
-          max_score = _mm512_add_ps(max_score, x_content);
-          _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
-                                                    tag_num + j_offset),
-                           max_score);
-          _mm512_storeu_si512(
-              reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
-                                         j_offset),
-              max_j);
-#else
-          // Update the alpha and track values.
-          __m256 x_content = _mm256_loadu_ps(
-              (const float*)(x + seq_offset + tag_num + j_offset));
-          max_score = _mm256_add_ps(max_score, x_content);
-          _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
-                                                    tag_num + j_offset),
-                           max_score);
-          _mm256_storeu_si256(
-              reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
-                                         j_offset),
-              max_j);
-#endif
-
-          // Calculate the offset of next step
-          j_offset += step_size;
-          if (j == steps - 1) {
-            if (remain > 0) {
-              j_offset += last_offset;
-            } else {
-              break;
-            }
-          }
-        }
-
-        seq_offset += tag_num;
-      }
-    } else {
-      for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
-
-      for (size_t k = 1; k < seq_len; ++k) {
-        for (size_t i = 0; i < tag_num; ++i) {
-          T max_score = -std::numeric_limits<T>::max();
-          int max_j = 0;
-          for (size_t j = 0; j < tag_num; ++j) {
-            T score = alpha_value[(k - 1) * tag_num + j] +
-                      w[(j + state_trans_base_idx) * tag_num + i];
-            if (score > max_score) {
-              max_score = score;
-              max_j = j;
-            }
-          }
-
-          alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
-          track_value[k * tag_num + i] = max_j;
-        }
-      }
-    }
-#else
-    for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
-
-    for (size_t k = 1; k < seq_len; ++k) {
-      for (size_t i = 0; i < tag_num; ++i) {
-        T max_score = -std::numeric_limits<T>::max();
-        int max_j = 0;
-        for (size_t j = 0; j < tag_num; ++j) {
-          T score = alpha_value[(k - 1) * tag_num + j] +
-                    w[(j + state_trans_base_idx) * tag_num + i];
-          if (score > max_score) {
-            max_score = score;
-            max_j = j;
-          }
-        }
-
-        alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
-        track_value[k * tag_num + i] = max_j;
-      }
-    }
-
-#endif
+    const auto& ker = math::jitkernel::KernelPool::Instance()
+                          .template Get<math::jitkernel::CRFDecodeKernel<T>>(
+                              static_cast<int>(tag_num));
+    ker->Compute(static_cast<int>(seq_len), x, w, alpha_value, track_value);
     T max_score = -std::numeric_limits<T>::max();
     int max_i = 0;
     for (size_t i = 0; i < tag_num; ++i) {
diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc
index d7a9bfbc437dbf4c723b9c87ff62ec6b62c38638..89416f7ab5d07ddac5b540b9bb361f831c1ef360 100644
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
@@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase {
   }
 };
 
+class DeleteVarOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};
+
 class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -48,4 +53,5 @@ It should not be configured by users directly.
 
 REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
                   paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::DeleteVarOpInfoMaker);
+                  paddle::operators::DeleteVarOpInfoMaker,
+                  paddle::operators::DeleteVarOpShapeInference);
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index aa8ed502fc94bd0970dfe5dbf00ef090e799ad30..d5eec148f9b4f76866ec9fca98a596b9bc2860ef 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -20,7 +20,7 @@ detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
 detection_library(iou_similarity_op SRCS iou_similarity_op.cc
 iou_similarity_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
-detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
+detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc
 anchor_generator_op.cu)
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index d7a53f1bef98ecda3ba7b36323678a11a632a15c..fddd6884017c35112ba48f245759f5d846b55f9a 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -439,31 +439,88 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
 class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    // TODO(buxingyuan): Add Document
-    AddInput("RpnRois", "RpnRois.");
-    AddInput("GtClasses", "GtClasses.");
-    AddInput("IsCrowd", "IsCrowd.");
-    AddInput("GtBoxes", "GtBoxes.");
-    AddInput("ImInfo", "ImInfo.");
-
-    AddOutput("Rois", "Rois.");
-    AddOutput("LabelsInt32", "LabelsInt32.");
-    AddOutput("BboxTargets", "BboxTargets.");
-    AddOutput("BboxInsideWeights", "BboxInsideWeights.");
-    AddOutput("BboxOutsideWeights", "BboxOutsideWeights.");
-
-    AddAttr<int>("batch_size_per_im", "batch_size_per_im");
-    AddAttr<float>("fg_fraction", "fg_fraction");
-    AddAttr<float>("fg_thresh", "fg_thresh");
-    AddAttr<float>("bg_thresh_hi", "bg_thresh_hi");
-    AddAttr<float>("bg_thresh_lo", "bg_thresh_lo");
-    AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights");
-    AddAttr<int>("class_nums", "class_nums");
-    AddAttr<bool>("use_random", "use_random").SetDefault(true);
+    AddInput(
+        "RpnRois",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [N, 4]. "
+        "N is the number of the GenerateProposalOp's output, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddInput("GtClasses",
+             "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+             "M is the number of groundtruth, "
+             "each element is a class label of groundtruth.");
+    AddInput(
+        "IsCrowd",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+        "M is the number of groundtruth, "
+        "each element is a flag indicates whether a groundtruth is crowd.");
+    AddInput(
+        "GtBoxes",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 4]. "
+        "M is the number of groundtruth, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddInput("ImInfo",
+             "(Tensor), This input is a 2D Tensor with shape [B, 3]. "
+             "B is the number of input images, "
+             "each element consists of im_height, im_width, im_scale.");
+
+    AddOutput(
+        "Rois",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. "
+        "P usuall equal to  batch_size_per_im * batch_size, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddOutput("LabelsInt32",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P], "
+              "each element repersents a class label of a roi");
+    AddOutput("BboxTargets",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
+              "class_nums], "
+              "each element repersents a box label of a roi");
+    AddOutput(
+        "BboxInsideWeights",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
+        "class_nums], "
+        "each element indicates whether a box should contribute to loss.");
+    AddOutput(
+        "BboxOutsideWeights",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
+        "class_nums], "
+        "each element indicates whether a box should contribute to loss.");
+
+    AddAttr<int>("batch_size_per_im", "Batch size of rois per images.");
+    AddAttr<float>("fg_fraction",
+                   "Foreground fraction in total batch_size_per_im.");
+    AddAttr<float>(
+        "fg_thresh",
+        "Overlap threshold which is used to chose foreground sample.");
+    AddAttr<float>("bg_thresh_hi",
+                   "Overlap threshold upper bound which is used to chose "
+                   "background sample.");
+    AddAttr<float>("bg_thresh_lo",
+                   "Overlap threshold lower bound which is used to chose "
+                   "background sample.");
+    AddAttr<std::vector<float>>("bbox_reg_weights", "Box regression weights.");
+    AddAttr<int>("class_nums", "Class number.");
+    AddAttr<bool>(
+        "use_random",
+        "Use random sampling to choose foreground and background boxes.")
+        .SetDefault(true);
 
     AddComment(R"DOC(
-Generate Proposals Labels Operator.
-)DOC");
+This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
+to sample foreground boxes and background boxes, and compute loss target.
+
+RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes
+were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction,
+If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample.
+If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi,
+then it was considered as a background sample.
+After all foreground and background boxes are chosen (so called Rois),
+then we apply random sampling to make sure
+the number of foreground boxes is no more than batch_size_per_im * fg_fraction.
+
+For each box in Rois, we assign the classification (class label) and regression targets (box label) to it.
+Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss.
+    )DOC");
   }
 };
 
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index 818d58ea9ee327fd99182ad2f8cbeed07e6aaea2..709c2dfc4b7c67d7d04074c58ce6da85b6e790fe 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <cmath>
+#include <cstring>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -25,21 +27,17 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-struct AppendProposalsFunctor {
-  LoDTensor *out_;
-  int64_t offset_;
-  Tensor *to_add_;
+static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
 
-  AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add)
-      : out_(out), offset_(offset), to_add_(to_add) {}
-
-  template <typename T>
-  void apply() const {
-    auto *out_data = out_->data<T>();
-    auto *to_add_data = to_add_->data<T>();
-    memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T));
-  }
-};
+static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
+  auto *out_data = dst->data<void>();
+  auto *to_add_data = src.data<void>();
+  size_t size_of_t = framework::SizeOfType(src.type());
+  offset *= size_of_t;
+  std::memcpy(
+      reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
+      to_add_data, src.numel() * size_of_t);
+}
 
 class GenerateProposalsOp : public framework::OperatorWithKernel {
  public:
@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
 };
 
 template <class T>
-void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
-              Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) {
+static inline void BoxCoder(const platform::DeviceContext &ctx,
+                            Tensor *all_anchors, Tensor *bbox_deltas,
+                            Tensor *variances, Tensor *proposals) {
   T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace());
 
   int64_t row = all_anchors->dims()[0];
@@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
                       anchor_center_y;
       bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
                                             bbox_deltas_data[i * len + 2],
-                                        std::log(1000.0 / 16.0))) *
+                                        kBBoxClipDefault)) *
                    anchor_width;
       bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
                                              bbox_deltas_data[i * len + 3],
-                                         std::log(1000.0 / 16.0))) *
+                                         kBBoxClipDefault)) *
                     anchor_height;
     } else {
       bbox_center_x =
@@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
       bbox_center_y =
           bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
       bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
-                                        std::log(1000.0 / 16.0))) *
+                                        kBBoxClipDefault)) *
                    anchor_width;
       bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
-                                         std::log(1000.0 / 16.0))) *
+                                         kBBoxClipDefault)) *
                     anchor_height;
     }
 
@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
 }
 
 template <class T>
-void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info,
-                    Tensor *boxes) {
+static inline void ClipTiledBoxes(const platform::DeviceContext &ctx,
+                                  const Tensor &im_info, Tensor *boxes) {
   T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
   const T *im_info_data = im_info.data<T>();
+  T zero(0);
   for (int64_t i = 0; i < boxes->numel(); ++i) {
     if (i % 4 == 0) {
       boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
     } else if (i % 4 == 1) {
       boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
     } else if (i % 4 == 2) {
       boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
     } else {
       boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
     }
   }
 }
 
 template <class T>
-void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
-                 float min_size, const Tensor &im_info, Tensor *keep) {
+static inline void FilterBoxes(const platform::DeviceContext &ctx,
+                               Tensor *boxes, float min_size,
+                               const Tensor &im_info, Tensor *keep) {
   const T *im_info_data = im_info.data<T>();
   T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
   T im_scale = im_info_data[2];
@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
   keep->Resize({keep_len});
 }
 
-bool SortScorePairDescend(const std::pair<float, int> &pair1,
-                          const std::pair<float, int> &pair2) {
-  return pair1.first > pair2.first;
-}
-
 template <class T>
-void GetMaxScoreIndex(const std::vector<T> &scores,
-                      std::vector<std::pair<T, int>> *sorted_indices) {
+static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
+    const std::vector<T> &scores) {
+  std::vector<std::pair<T, int>> sorted_indices;
+  sorted_indices.reserve(scores.size());
   for (size_t i = 0; i < scores.size(); ++i) {
-    sorted_indices->push_back(std::make_pair(scores[i], i));
+    sorted_indices.emplace_back(scores[i], i);
   }
   // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend);
+  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
+                   [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
+                     return a.first < b.first;
+                   });
+  return sorted_indices;
 }
 
 template <class T>
-T BBoxArea(const T *box, const bool normalized) {
+static inline T BBoxArea(const T *box, bool normalized) {
   if (box[2] < box[0] || box[3] < box[1]) {
     // If coordinate values are is invalid
     // (e.g. xmax < xmin or ymax < ymin), return 0.
@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) {
 }
 
 template <class T>
-T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
+static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
   if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
       box2[3] < box1[1]) {
     return static_cast<T>(0.);
@@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
     const T inter_ymin = std::max(box1[1], box2[1]);
     const T inter_xmax = std::min(box1[2], box2[2]);
     const T inter_ymax = std::min(box1[3], box2[3]);
-    const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
-    const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
+    const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
+    const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
     const T inter_area = inter_w * inter_h;
     const T bbox1_area = BBoxArea<T>(box1, normalized);
     const T bbox2_area = BBoxArea<T>(box2, normalized);
@@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
   }
 }
 
+template <typename T>
+static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
+                                    int selected_num) {
+  Tensor keep_nms;
+  keep_nms.Resize({selected_num});
+  auto *keep_data = keep_nms.mutable_data<T>(platform::CPUPlace());
+  for (int i = 0; i < selected_num; ++i) {
+    keep_data[i] = selected_indices[i];
+  }
+  return keep_nms;
+}
+
 template <class T>
-Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
-           const T nms_threshold, const float eta) {
+static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox,
+                         Tensor *scores, T nms_threshold, float eta) {
   PADDLE_ENFORCE_NOT_NULL(bbox);
   int64_t num_boxes = bbox->dims()[0];
   // 4: [xmin ymin xmax ymax]
@@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
 
   std::vector<T> scores_data(num_boxes);
   std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
-  std::vector<std::pair<T, int>> sorted_indices;
-  GetMaxScoreIndex<T>(scores_data, &sorted_indices);
+  std::vector<std::pair<T, int>> sorted_indices =
+      GetSortedScoreIndex<T>(scores_data);
 
   std::vector<int> selected_indices;
   int selected_num = 0;
   T adaptive_threshold = nms_threshold;
   const T *bbox_data = bbox->data<T>();
-  bool flag;
   while (sorted_indices.size() != 0) {
-    int idx = sorted_indices.front().second;
-    flag = true;
-    for (size_t k = 0; k < selected_indices.size(); ++k) {
+    int idx = sorted_indices.back().second;
+    bool flag = true;
+    for (int kept_idx : selected_indices) {
       if (flag) {
-        const int kept_idx = selected_indices[k];
         T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                       bbox_data + kept_idx * box_size, false);
         flag = (overlap <= adaptive_threshold);
@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
     }
     if (flag) {
       selected_indices.push_back(idx);
-      selected_num++;
+      ++selected_num;
     }
-    sorted_indices.erase(sorted_indices.begin());
+    sorted_indices.erase(sorted_indices.end() - 1);
     if (flag && eta < 1 && adaptive_threshold > 0.5) {
       adaptive_threshold *= eta;
     }
   }
-  Tensor keep_nms;
-  keep_nms.Resize({selected_num});
-  int *keep_data = keep_nms.mutable_data<int>(ctx.GetPlace());
-  for (int i = 0; i < selected_num; ++i) {
-    keep_data[i] = selected_indices[i];
-  }
-
-  return keep_nms;
+  return VectorToTensor(selected_indices, selected_num);
 }
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class GenerateProposalsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     auto *scores = context.Input<Tensor>("Scores");
     auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
     auto *im_info = context.Input<Tensor>("ImInfo");
-    auto *anchors = context.Input<Tensor>("Anchors");
-    auto *variances = context.Input<Tensor>("Variances");
+    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
+                               "Cannot find input Anchors(%s) in scope",
+                               context.Inputs("Anchors")[0]);
+    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
+                                 "Cannot find input Variances(%s) in scope",
+                                 context.Inputs("Variances")[0]);
 
     auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
     auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     float min_size = context.Attr<float>("min_size");
     float eta = context.Attr<float>("eta");
 
-    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto &dev_ctx =
+        context.template device_context<platform::CPUDeviceContext>();
 
-    auto scores_dim = scores->dims();
+    auto &scores_dim = scores->dims();
     int64_t num = scores_dim[0];
     int64_t c_score = scores_dim[1];
     int64_t h_score = scores_dim[2];
     int64_t w_score = scores_dim[3];
 
-    auto bbox_dim = bbox_deltas->dims();
+    auto &bbox_dim = bbox_deltas->dims();
     int64_t c_bbox = bbox_dim[1];
     int64_t h_bbox = bbox_dim[2];
     int64_t w_bbox = bbox_dim[3];
@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
                                 dev_ctx.GetPlace());
 
-    math::Transpose<DeviceContext, T, 4> trans;
+    math::Transpose<platform::CPUDeviceContext, T, 4> trans;
     std::vector<int> axis = {0, 2, 3, 1};
     trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
     trans(dev_ctx, *scores, &scores_swap, axis);
 
     framework::LoD lod;
-    std::vector<size_t> lod0(1, 0);
-    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
-    anchor->Resize({anchors->numel() / 4, 4});
-    Tensor *var = const_cast<framework::Tensor *>(variances);
-    var->Resize({var->numel() / 4, 4});
+    lod.resize(1);
+    auto &lod0 = lod[0];
+    lod0.push_back(0);
+    anchors.Resize({anchors.numel() / 4, 4});
+    variances.Resize({variances.numel() / 4, 4});
 
     int64_t num_proposals = 0;
     for (int64_t i = 0; i < num; ++i) {
@@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
       scores_slice.Resize({h_score * w_score * c_score, 1});
 
       std::pair<Tensor, Tensor> tensor_pair =
-          ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var,
+          ProposalForOneImage(dev_ctx, im_info_slice, anchors, variances,
                               bbox_deltas_slice, scores_slice, pre_nms_top_n,
                               post_nms_top_n, nms_thresh, min_size, eta);
-      Tensor proposals = tensor_pair.first;
-      Tensor scores = tensor_pair.second;
-
-      framework::VisitDataType(
-          framework::ToDataType(rpn_rois->type()),
-          AppendProposalsFunctor(rpn_rois, 4 * num_proposals, &proposals));
-      framework::VisitDataType(
-          framework::ToDataType(rpn_roi_probs->type()),
-          AppendProposalsFunctor(rpn_roi_probs, num_proposals, &scores));
+      Tensor &proposals = tensor_pair.first;
+      Tensor &scores = tensor_pair.second;
 
+      AppendProposals(rpn_rois, 4 * num_proposals, proposals);
+      AppendProposals(rpn_roi_probs, num_proposals, scores);
       num_proposals += proposals.dims()[0];
-      lod0.emplace_back(num_proposals);
+      lod0.push_back(num_proposals);
     }
-
-    lod.emplace_back(lod0);
     rpn_rois->set_lod(lod);
     rpn_roi_probs->set_lod(lod);
     rpn_rois->Resize({num_proposals, 4});
@@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
   }
 
   std::pair<Tensor, Tensor> ProposalForOneImage(
-      const DeviceContext &ctx, const Tensor &im_info_slice,
+      const platform::CPUDeviceContext &ctx, const Tensor &im_info_slice,
       const Tensor &anchors, const Tensor &variances,
       const Tensor &bbox_deltas_slice,  // [M, 4]
       const Tensor &scores_slice,       // [N, 1]
@@ -392,10 +394,9 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     for (int i = 0; i < scores_slice.numel(); ++i) {
       index[i] = i;
     }
-    std::function<bool(const int64_t &, const int64_t &)> compare =
-        [scores_data](const int64_t &i, const int64_t &j) {
-          return scores_data[i] > scores_data[j];
-        };
+    auto compare = [scores_data](const int64_t &i, const int64_t &j) {
+      return scores_data[i] > scores_data[j];
+    };
 
     if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
       std::sort(index, index + scores_slice.numel(), compare);
@@ -452,33 +453,45 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Scores", "The scores of anchors should be foreground.");
-    AddInput("BboxDeltas", "bbox_deltas.");
-    AddInput("ImInfo", "Information for image reshape.");
-    AddInput("Anchors", "All anchors.");
-    AddInput("Variances", " variances");
-
-    AddOutput("RpnRois", "Anchors.");
-    AddOutput("RpnRoiProbs", "Anchors.");
-    AddAttr<int>("pre_nms_topN", "pre_nms_topN");
-    AddAttr<int>("post_nms_topN", "post_nms_topN");
-    AddAttr<float>("nms_thresh", "nms_thres");
-    AddAttr<float>("min_size", "min size");
+    AddInput("Scores",
+             "(Tensor) The scores from conv is in shape (N, A, H, W), "
+             "N is batch size, A is number of anchors, "
+             "H and W are height and width of the feature map");
+    AddInput("BboxDeltas",
+             "(Tensor) Bounding box deltas from conv is in "
+             "shape (N, 4*A, H, W).");
+    AddInput("ImInfo",
+             "(Tensor) Information for image reshape is in shape (N, 3), "
+             "in format (height, width, scale)");
+    AddInput("Anchors",
+             "(Tensor) Bounding box anchors from anchor_generator_op "
+             "is in shape (A, H, W, 4).");
+    AddInput("Variances",
+             "(Tensor) Bounding box variances with same shape as `Anchors`.");
+
+    AddOutput("RpnRois",
+              "(LoDTensor), Output proposals with shape (rois_num, 4).");
+    AddOutput("RpnRoiProbs",
+              "(LoDTensor) Scores of proposals with shape (rois_num, 1).");
+    AddAttr<int>("pre_nms_topN",
+                 "Number of top scoring RPN proposals to keep before "
+                 "applying NMS.");
+    AddAttr<int>("post_nms_topN",
+                 "Number of top scoring RPN proposals to keep after "
+                 "applying NMS");
+    AddAttr<float>("nms_thresh", "NMS threshold used on RPN proposals.");
+    AddAttr<float>("min_size",
+                   "Proposal height and width both need to be greater "
+                   "than this min_size.");
     AddAttr<float>("eta", "The parameter for adaptive NMS.");
     AddComment(R"DOC(
-Generate Proposals OP
-
-This operator proposes rois according to each box with their probability to be a foreground object and 
-the box can be calculated by anchors. Bbox_deltais and scores are the output of RPN. Final proposals
-could be used to train detection net.
-
-Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
-of anchors, H and W are height and width of the feature map.
-BboxDeltas is the differece between predicted box locatoin and anchor location. In format of (N, 4*A, H, W)
+This operator Generate bounding box proposals for Faster RCNN.
+The propoasls are generated for a list of images based on image
+score 'Scores', bounding box regression result 'BboxDeltas' as
+well as predefined bounding box shapes 'anchors'. Greedy
+non-maximum suppression is applied to generate the final bounding
+boxes.
 
-For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and 
- calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area. 
-Finally, apply nms to get final proposals as output.
 )DOC");
   }
 };
@@ -490,6 +503,5 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
                   ops::GenerateProposalsOpMaker,
                   paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    generate_proposals,
-    ops::GenerateProposalsKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
+                       ops::GenerateProposalsKernel<double>);
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index 6146ff509d768c0317a5c65ed22af1a3075977a2..91213b3c4d9db54469ec151ff1dd8e56c3118fea 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -16,10 +16,13 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "cub/cub.cuh"
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
@@ -36,36 +39,38 @@ namespace {
 
 int const kThreadsPerBlock = sizeof(uint64_t) * 8;
 
-template <typename T>
-__global__ void RangeInitKernel(const T start, const T delta, const int size,
-                                T *out) {
-  CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
-}
+static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
+
+struct RangeInitFunctor {
+  int start_;
+  int delta_;
+  int *out_;
+  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
+};
 
 template <typename T>
-void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
-                    Tensor *value_out, Tensor *index_out) {
-  int num = value.numel();
+static void SortDescending(const platform::CUDADeviceContext &ctx,
+                           const Tensor &value, Tensor *value_out,
+                           Tensor *index_out) {
+  int num = static_cast<int>(value.numel());
   Tensor index_in_t;
   int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
-  int block = 512;
-  auto stream = ctx.stream();
-  RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
+  platform::ForRange<platform::CUDADeviceContext> for_range(ctx, num);
+  for_range(RangeInitFunctor{0, 1, idx_in});
+
   int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
 
   const T *keys_in = value.data<T>();
   T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
 
   // Determine temporary device storage requirements
-  void *d_temp_storage = NULL;
   size_t temp_storage_bytes = 0;
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
-      d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
-      num);
+      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
 
   // Allocate temporary storage
   auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  d_temp_storage = memory::Alloc(place, temp_storage_bytes);
+  void *d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
   // Run sorting operation
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
@@ -76,22 +81,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
 }
 
 template <typename T>
-__device__ __forceinline__ T Min(T x, T y) {
-  return x < y ? x : y;
-}
-
-template <typename T>
-__device__ __forceinline__ T Max(T x, T y) {
-  return x > y ? x : y;
-}
-
-template <typename T>
-__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
-                                       const T *var, const int *index,
-                                       const T *im_info, const int num,
-                                       T *proposals) {
-  T kBBoxClipDefault = log(1000.0 / 16.0);
-  CUDA_1D_KERNEL_LOOP(i, num) {
+struct BoxDecodeAndClipFunctor {
+  const T *anchor;
+  const T *deltas;
+  const T *var;
+  const int *index;
+  const T *im_info;
+
+  T *proposals;
+
+  BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var,
+                          const int *index, const T *im_info, T *proposals)
+      : anchor(anchor),
+        deltas(deltas),
+        var(var),
+        index(index),
+        im_info(im_info),
+        proposals(proposals) {}
+
+  T bbox_clip_default{static_cast<T>(kBBoxClipDefault)};
+
+  __device__ void operator()(size_t i) {
     int k = index[i] * 4;
     T axmin = anchor[k];
     T aymin = anchor[k + 1];
@@ -108,17 +118,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
     T dxmax = deltas[k + 2];
     T dymax = deltas[k + 3];
 
-    T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
+    T d_cx, d_cy, d_w, d_h;
     if (var) {
       d_cx = cx + dxmin * w * var[k];
       d_cy = cy + dymin * h * var[k + 1];
-      d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
-      d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
+      d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w;
+      d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h;
     } else {
       d_cx = cx + dxmin * w;
       d_cy = cy + dymin * h;
-      d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
-      d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
+      d_w = exp(Min(dxmax, bbox_clip_default)) * w;
+      d_h = exp(Min(dymax, bbox_clip_default)) * h;
     }
 
     T oxmin = d_cx - d_w * 0.5;
@@ -126,17 +136,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
     T oxmax = d_cx + d_w * 0.5 - 1.;
     T oymax = d_cy + d_h * 0.5 - 1.;
 
-    proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
-    proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
+    proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.);
+    proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.);
   }
-}
+
+  __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; }
+
+  __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; }
+};
 
 template <typename T, int BlockSize>
-__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
-                             const T min_size, const int num, int *keep_num,
-                             int *keep) {
+static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
+                                    const T min_size, const int num,
+                                    int *keep_num, int *keep) {
   T im_h = im_info[0];
   T im_w = im_info[1];
   T im_scale = im_info[2];
@@ -181,7 +195,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
   }
 }
 
-__device__ inline float IoU(const float *a, const float *b) {
+static __device__ inline float IoU(const float *a, const float *b) {
   float left = max(a[0], b[0]), right = min(a[2], b[2]);
   float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
   float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
@@ -191,8 +205,9 @@ __device__ inline float IoU(const float *a, const float *b) {
   return inter_s / (s_a + s_b - inter_s);
 }
 
-__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
-                          const float *dev_boxes, uint64_t *dev_mask) {
+static __global__ void NMSKernel(const int n_boxes,
+                                 const float nms_overlap_thresh,
+                                 const float *dev_boxes, uint64_t *dev_mask) {
   const int row_start = blockIdx.y;
   const int col_start = blockIdx.x;
 
@@ -234,9 +249,9 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
 }
 
 template <typename T>
-void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
-         const Tensor &sorted_indices, const T nms_threshold,
-         Tensor *keep_out) {
+static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
+                const Tensor &sorted_indices, const T nms_threshold,
+                Tensor *keep_out) {
   int boxes_num = proposals.dims()[0];
   PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
 
@@ -247,13 +262,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
 
   const T *boxes = proposals.data<T>();
   auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
-  uint64_t *d_mask =
-      reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
-  NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
-  uint64_t *h_mask = reinterpret_cast<uint64_t *>(
-      memory::Alloc(platform::CPUPlace(), size_bytes));
-  memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
+  framework::Vector<uint64_t> mask(boxes_num * col_blocks);
+  NMSKernel<<<blocks, threads>>>(
+      boxes_num, nms_threshold, boxes,
+      mask.CUDAMutableData(boost::get<platform::CUDAPlace>(ctx.GetPlace())));
 
   std::vector<uint64_t> remv(col_blocks);
   memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
@@ -267,7 +279,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
     if (!(remv[nblock] & (1ULL << inblock))) {
       ++num_to_keep;
       keep_vec.push_back(i);
-      uint64_t *p = &h_mask[0] + i * col_blocks;
+      uint64_t *p = &mask[0] + i * col_blocks;
       for (int j = nblock; j < col_blocks; j++) {
         remv[j] |= p[j];
       }
@@ -276,12 +288,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
   int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
   memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
                sizeof(int) * num_to_keep, 0);
-  memory::Free(place, d_mask);
-  memory::Free(platform::CPUPlace(), h_mask);
 }
 
 template <typename T>
-std::pair<Tensor, Tensor> ProposalForOneImage(
+static std::pair<Tensor, Tensor> ProposalForOneImage(
     const platform::CUDADeviceContext &ctx, const Tensor &im_info,
     const Tensor &anchors, const Tensor &variances,
     const Tensor &bbox_deltas,  // [M, 4]
@@ -300,18 +310,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   // 2. box decode and clipping
   Tensor proposals;
   proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
-  int block = 512;
-  auto stream = ctx.stream();
-  BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
-      anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
-      index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
-      proposals.data<T>());
+
+  {
+    platform::ForRange<platform::CUDADeviceContext> for_range(ctx, pre_nms_num);
+    for_range(BoxDecodeAndClipFunctor<T>{
+        anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
+        index_sort.data<int>(), im_info.data<T>(), proposals.data<T>()});
+  }
 
   // 3. filter
   Tensor keep_index, keep_num_t;
   keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
   keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
   min_size = std::max(min_size, 1.0f);
+  auto stream = ctx.stream();
   FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
       proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
       keep_num_t.data<int>(), keep_index.data<int>());
@@ -355,8 +367,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
     auto *scores = context.Input<Tensor>("Scores");
     auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
     auto *im_info = context.Input<Tensor>("ImInfo");
-    auto *anchors = context.Input<Tensor>("Anchors");
-    auto *variances = context.Input<Tensor>("Variances");
+    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
+                               "Cannot find input Anchors(%s) in scope",
+                               context.Inputs("Anchors")[0]);
+    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
+                                 "Cannot find input Variances(%s) in scope",
+                                 context.Inputs("Variances")[0]);
 
     auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
     auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
@@ -392,10 +408,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
     trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
     trans(dev_ctx, *scores, &scores_swap, axis);
 
-    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
-    anchor->Resize({anchors->numel() / 4, 4});
-    Tensor *var = const_cast<framework::Tensor *>(variances);
-    var->Resize({var->numel() / 4, 4});
+    anchors.Resize({anchors.numel() / 4, 4});
+    variances.Resize({variances.numel() / 4, 4});
 
     rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
                               context.GetPlace());
@@ -417,12 +431,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
       scores_slice.Resize({h_score * w_score * c_score, 1});
 
       std::pair<Tensor, Tensor> box_score_pair =
-          ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
+          ProposalForOneImage<T>(dev_ctx, im_info_slice, anchors, variances,
                                  bbox_deltas_slice, scores_slice, pre_nms_top_n,
                                  post_nms_top_n, nms_thresh, min_size, eta);
 
-      Tensor proposals = box_score_pair.first;
-      Tensor scores = box_score_pair.second;
+      Tensor &proposals = box_score_pair.first;
+      Tensor &scores = box_score_pair.second;
 
       memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
                    proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c0823c0487d39eece5be08322e7d182b931ba3c
--- /dev/null
+++ b/paddle/fluid/operators/detection/gpc.cc
@@ -0,0 +1,2201 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file src/gpc.cpp
+ * @author huhan02(com@baidu.com)
+ * @date 2015/12/18 14:17:30
+ * @brief
+ *
+ * @modified by sunyipeng
+ * @email sunyipeng@baidu.com
+ * @date 2018/6/12
+ **/
+
+#include "paddle/fluid/operators/detection/gpc.h"
+
+namespace gpc {
+
+typedef struct lmt_shape { /* Local minima table                */
+  double y;                /* Y coordinate at local minimum     */
+  edge_node *first_bound;  /* Pointer to bound list             */
+  struct lmt_shape *next;  /* Pointer to next local minimum     */
+} lmt_node;
+
+typedef struct sbt_t_shape { /* Scanbeam tree                     */
+  double y;                  /* Scanbeam node y value             */
+  struct sbt_t_shape *less;  /* Pointer to nodes with lower y     */
+  struct sbt_t_shape *more;  /* Pointer to nodes with higher y    */
+} sb_tree;
+
+typedef struct it_shape { /* Intersection table                */
+  edge_node *ie[2];       /* Intersecting edge (bundle) pair   */
+  gpc_vertex point;       /* Point of intersection             */
+  struct it_shape *next;  /* The next intersection table node  */
+} it_node;
+
+typedef struct st_shape { /* Sorted edge table                 */
+  edge_node *edge;        /* Pointer to AET edge               */
+  double xb;              /* Scanbeam bottom x coordinate      */
+  double xt;              /* Scanbeam top x coordinate         */
+  double dx;              /* Change in x for a unit y increase */
+  struct st_shape *prev;  /* Previous edge in sorted list      */
+} st_node;
+
+typedef struct bbox_shape { /* Contour axis-aligned bounding box */
+  double xmin;              /* Minimum x coordinate              */
+  double ymin;              /* Minimum y coordinate              */
+  double xmax;              /* Maximum x coordinate              */
+  double ymax;              /* Maximum y coordinate              */
+} bbox;
+
+/*
+===========================================================================
+                               Global Data
+===========================================================================
+*/
+
+/* Horizontal edge state transitions within scanbeam boundary */
+const h_state next_h_state[3][6] = {
+    /*        ABOVE     BELOW     CROSS */
+    /*        L   R     L   R     L   R */
+    /* NH */
+    {BH, TH, TH, BH, NH, NH},
+    /* BH */
+    {NH, NH, NH, NH, TH, TH},
+    /* TH */
+    {NH, NH, NH, NH, BH, BH}};
+
+/*
+===========================================================================
+                             Private Functions
+===========================================================================
+*/
+
+static void reset_it(it_node **it) {
+  it_node *itn;
+
+  while (*it) {
+    itn = (*it)->next;
+    gpc_free<it_node>(*it);
+    *it = itn;
+  }
+}
+
+static void reset_lmt(lmt_node **lmt) {
+  lmt_node *lmtn;
+
+  while (*lmt) {
+    lmtn = (*lmt)->next;
+    gpc_free<lmt_node>(*lmt);
+    *lmt = lmtn;
+  }
+}
+
+static void insert_bound(edge_node **b, edge_node *e) {
+  edge_node *existing_bound = NULL;
+
+  if (!*b) {
+    /* Link node e to the tail of the list */
+    *b = e;
+  } else {
+    /* Do primary sort on the x field */
+    if (e[0].bot.x < (*b)[0].bot.x) {
+      /* Insert a new node mid-list */
+      existing_bound = *b;
+      *b = e;
+      (*b)->next_bound = existing_bound;
+    } else {
+      if (e[0].bot.x == (*b)[0].bot.x) {
+        /* Do secondary sort on the dx field */
+        if (e[0].dx < (*b)[0].dx) {
+          /* Insert a new node mid-list */
+          existing_bound = *b;
+          *b = e;
+          (*b)->next_bound = existing_bound;
+        } else {
+          /* Head further down the list */
+          insert_bound(&((*b)->next_bound), e);
+        }
+      } else {
+        /* Head further down the list */
+        insert_bound(&((*b)->next_bound), e);
+      }
+    }
+  }
+}
+
+static edge_node **bound_list(lmt_node **lmt, double y) {
+  lmt_node *existing_node;
+
+  if (!*lmt) {
+    /* Add node onto the tail end of the LMT */
+    gpc_malloc<lmt_node>(*lmt, sizeof(lmt_node),
+                         const_cast<char *>("LMT insertion"));
+    (*lmt)->y = y;
+    (*lmt)->first_bound = NULL;
+    (*lmt)->next = NULL;
+    return &((*lmt)->first_bound);
+  } else if (y < (*lmt)->y) {
+    /* Insert a new LMT node before the current node */
+    existing_node = *lmt;
+    gpc_malloc<lmt_node>(*lmt, sizeof(lmt_node),
+                         const_cast<char *>("LMT insertion"));
+    (*lmt)->y = y;
+    (*lmt)->first_bound = NULL;
+    (*lmt)->next = existing_node;
+    return &((*lmt)->first_bound);
+  } else {
+    if (y > (*lmt)->y) {
+      /* Head further up the LMT */
+      return bound_list(&((*lmt)->next), y);
+    } else {
+      /* Use this existing LMT node */
+      return &((*lmt)->first_bound);
+    }
+  }
+}
+
+static void add_to_sbtree(int *entries, sb_tree **sbtree, double y) {
+  if (!*sbtree) {
+    /* Add a new tree node here */
+    gpc_malloc<sb_tree>(*sbtree, sizeof(sb_tree),
+                        const_cast<char *>("scanbeam tree insertion"));
+    (*sbtree)->y = y;
+    (*sbtree)->less = NULL;
+    (*sbtree)->more = NULL;
+    (*entries)++;
+  } else {
+    if ((*sbtree)->y > y) {
+      /* Head into the 'less' sub-tree */
+      add_to_sbtree(entries, &((*sbtree)->less), y);
+    } else {
+      if ((*sbtree)->y < y) {
+        /* Head into the 'more' sub-tree */
+        add_to_sbtree(entries, &((*sbtree)->more), y);
+      }
+    }
+  }
+}
+
+static void build_sbt(int *entries, double *sbt, sb_tree *sbtree) {
+  if (sbtree->less) {
+    build_sbt(entries, sbt, sbtree->less);
+  }
+  sbt[*entries] = sbtree->y;
+  (*entries)++;
+  if (sbtree->more) {
+    build_sbt(entries, sbt, sbtree->more);
+  }
+}
+
+static void free_sbtree(sb_tree **sbtree) {
+  if (*sbtree) {
+    free_sbtree(&((*sbtree)->less));
+    free_sbtree(&((*sbtree)->more));
+    gpc_free<sb_tree>(*sbtree);
+  }
+}
+
+static int count_optimal_vertices(gpc_vertex_list c) {
+  int result = 0;
+  int i = 0;
+
+  /* Ignore non-contributing contours */
+  if (c.num_vertices > 0) {
+    for (i = 0; i < c.num_vertices; i++) {
+      /* Ignore superfluous vertices embedded in horizontal edges */
+      if (gpc_optimal(c.vertex, i, c.num_vertices)) {
+        result++;
+      }
+    }
+  }
+  return result;
+}
+
+static edge_node *build_lmt(lmt_node **lmt, sb_tree **sbtree, int *sbt_entries,
+                            gpc_polygon *p, int type, gpc_op op) {
+  int c = 0;
+  int i = 0;
+  int min = 0;
+  int max = 0;
+  int num_edges = 0;
+  int v = 0;
+  int num_vertices = 0;
+  int total_vertices = 0;
+  int e_index = 0;
+  edge_node *e = NULL;
+  edge_node *edge_table = NULL;
+
+  for (c = 0; c < p->num_contours; c++) {
+    total_vertices += count_optimal_vertices(p->contour[c]);
+  }
+
+  /* Create the entire input polygon edge table in one go */
+  gpc_malloc<edge_node>(edge_table, total_vertices * sizeof(edge_node),
+                        const_cast<char *>("edge table creation"));
+
+  for (c = 0; c < p->num_contours; c++) {
+    if (p->contour[c].num_vertices < 0) {
+      /* Ignore the non-contributing contour and repair the vertex count */
+      p->contour[c].num_vertices = -p->contour[c].num_vertices;
+    } else {
+      /* Perform contour optimisation */
+      num_vertices = 0;
+      for (i = 0; i < p->contour[c].num_vertices; i++) {
+        if (gpc_optimal(p->contour[c].vertex, i, p->contour[c].num_vertices)) {
+          edge_table[num_vertices].vertex.x = p->contour[c].vertex[i].x;
+          edge_table[num_vertices].vertex.y = p->contour[c].vertex[i].y;
+
+          /* Record vertex in the scanbeam table */
+          add_to_sbtree(sbt_entries, sbtree, edge_table[num_vertices].vertex.y);
+
+          num_vertices++;
+        }
+      }
+
+      /* Do the contour forward pass */
+      for (min = 0; min < num_vertices; min++) {
+        /* If a forward local minimum... */
+        if (gpc_fwd_min(edge_table, min, num_vertices)) {
+          /* Search for the next local maximum... */
+          num_edges = 1;
+          max = gpc_next_index(min, num_vertices);
+          while (gpc_not_fmax(edge_table, max, num_vertices)) {
+            num_edges++;
+            max = gpc_next_index(max, num_vertices);
+          }
+
+          /* Build the next edge list */
+          e = &edge_table[e_index];
+          e_index += num_edges;
+          v = min;
+          e[0].bstate[BELOW] = UNBUNDLED;
+          e[0].bundle[BELOW][CLIP] = 0;
+          e[0].bundle[BELOW][SUBJ] = 0;
+          for (i = 0; i < num_edges; i++) {
+            e[i].xb = edge_table[v].vertex.x;
+            e[i].bot.x = edge_table[v].vertex.x;
+            e[i].bot.y = edge_table[v].vertex.y;
+
+            v = gpc_next_index(v, num_vertices);
+
+            e[i].top.x = edge_table[v].vertex.x;
+            e[i].top.y = edge_table[v].vertex.y;
+            e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) /
+                      (e[i].top.y - e[i].bot.y);
+            e[i].type = type;
+            e[i].outp[ABOVE] = NULL;
+            e[i].outp[BELOW] = NULL;
+            e[i].next = NULL;
+            e[i].prev = NULL;
+            e[i].succ =
+                ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL;
+            e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL;
+            e[i].next_bound = NULL;
+            e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT;
+            e[i].bside[SUBJ] = LEFT;
+          }
+          insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
+        }
+      }
+
+      /* Do the contour reverse pass */
+      for (min = 0; min < num_vertices; min++) {
+        /* If a reverse local minimum... */
+        if (gpc_rev_min(edge_table, min, num_vertices)) {
+          /* Search for the previous local maximum... */
+          num_edges = 1;
+          max = gpc_prev_index(min, num_vertices);
+          while (gpc_not_rmax(edge_table, max, num_vertices)) {
+            num_edges++;
+            max = gpc_prev_index(max, num_vertices);
+          }
+
+          /* Build the previous edge list */
+          e = &edge_table[e_index];
+          e_index += num_edges;
+          v = min;
+          e[0].bstate[BELOW] = UNBUNDLED;
+          e[0].bundle[BELOW][CLIP] = 0;
+          e[0].bundle[BELOW][SUBJ] = 0;
+          for (i = 0; i < num_edges; i++) {
+            e[i].xb = edge_table[v].vertex.x;
+            e[i].bot.x = edge_table[v].vertex.x;
+            e[i].bot.y = edge_table[v].vertex.y;
+
+            v = gpc_prev_index(v, num_vertices);
+
+            e[i].top.x = edge_table[v].vertex.x;
+            e[i].top.y = edge_table[v].vertex.y;
+            e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) /
+                      (e[i].top.y - e[i].bot.y);
+            e[i].type = type;
+            e[i].outp[ABOVE] = NULL;
+            e[i].outp[BELOW] = NULL;
+            e[i].next = NULL;
+            e[i].prev = NULL;
+            e[i].succ =
+                ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL;
+            e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL;
+            e[i].next_bound = NULL;
+            e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT;
+            e[i].bside[SUBJ] = LEFT;
+          }
+          insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
+        }
+      }
+    }
+  }
+  return edge_table;
+}  // NOLINT
+
+static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) {
+  if (!*aet) {
+    /* Append edge onto the tail end of the AET */
+    *aet = edge;
+    edge->prev = prev;
+    edge->next = NULL;
+  } else {
+    /* Do primary sort on the xb field */
+    if (edge->xb < (*aet)->xb) {
+      /* Insert edge here (before the AET edge) */
+      edge->prev = prev;
+      edge->next = *aet;
+      (*aet)->prev = edge;
+      *aet = edge;
+    } else {
+      if (edge->xb == (*aet)->xb) {
+        /* Do secondary sort on the dx field */
+        if (edge->dx < (*aet)->dx) {
+          /* Insert edge here (before the AET edge) */
+          edge->prev = prev;
+          edge->next = *aet;
+          (*aet)->prev = edge;
+          *aet = edge;
+        } else {
+          /* Head further into the AET */
+          add_edge_to_aet(&((*aet)->next), edge, *aet);
+        }
+      } else {
+        /* Head further into the AET */
+        add_edge_to_aet(&((*aet)->next), edge, *aet);
+      }
+    }
+  }
+}
+
+static void add_intersection(it_node **it, edge_node *edge0, edge_node *edge1,
+                             double x, double y) {
+  it_node *existing_node;
+
+  if (!*it) {
+    /* Append a new node to the tail of the list */
+    gpc_malloc<it_node>(*it, sizeof(it_node),
+                        const_cast<char *>("IT insertion"));
+    (*it)->ie[0] = edge0;
+    (*it)->ie[1] = edge1;
+    (*it)->point.x = x;
+    (*it)->point.y = y;
+    (*it)->next = NULL;
+  } else {
+    if ((*it)->point.y > y) {
+      /* Insert a new node mid-list */
+      existing_node = *it;
+      gpc_malloc<it_node>(*it, sizeof(it_node),
+                          const_cast<char *>("IT insertion"));
+      (*it)->ie[0] = edge0;
+      (*it)->ie[1] = edge1;
+      (*it)->point.x = x;
+      (*it)->point.y = y;
+      (*it)->next = existing_node;
+    } else {
+      /* Head further down the list */
+      add_intersection(&((*it)->next), edge0, edge1, x, y);
+    }
+  }
+}
+
+static void add_st_edge(st_node **st, it_node **it, edge_node *edge,
+                        double dy) {
+  st_node *existing_node;
+  double den = 0.0;
+  double r = 0.0;
+  double x = 0.0;
+  double y = 0.0;
+
+  if (!*st) {
+    /* Append edge onto the tail end of the ST */
+    gpc_malloc<st_node>(*st, sizeof(st_node),
+                        const_cast<char *>("ST insertion"));
+    (*st)->edge = edge;
+    (*st)->xb = edge->xb;
+    (*st)->xt = edge->xt;
+    (*st)->dx = edge->dx;
+    (*st)->prev = NULL;
+  } else {
+    den = ((*st)->xt - (*st)->xb) - (edge->xt - edge->xb);
+
+    /* If new edge and ST edge don't cross */
+    if ((edge->xt >= (*st)->xt) || (edge->dx == (*st)->dx) ||
+        (fabs(den) <= DBL_EPSILON)) {
+      /* No intersection - insert edge here (before the ST edge) */
+      existing_node = *st;
+      gpc_malloc<st_node>(*st, sizeof(st_node),
+                          const_cast<char *>("ST insertion"));
+      (*st)->edge = edge;
+      (*st)->xb = edge->xb;
+      (*st)->xt = edge->xt;
+      (*st)->dx = edge->dx;
+      (*st)->prev = existing_node;
+    } else {
+      /* Compute intersection between new edge and ST edge */
+      r = (edge->xb - (*st)->xb) / den;
+      x = (*st)->xb + r * ((*st)->xt - (*st)->xb);
+      y = r * dy;
+
+      /* Insert the edge pointers and the intersection point in the IT */
+      add_intersection(it, (*st)->edge, edge, x, y);
+
+      /* Head further into the ST */
+      add_st_edge(&((*st)->prev), it, edge, dy);
+    }
+  }
+}
+
+static void build_intersection_table(it_node **it, edge_node *aet, double dy) {
+  st_node *st;
+  st_node *stp;
+  edge_node *edge = NULL;
+
+  /* Build intersection table for the current scanbeam */
+  reset_it(it);
+  st = NULL;
+
+  /* Process each AET edge */
+  for (edge = aet; edge; edge = edge->next) {
+    if ((edge->bstate[ABOVE] == BUNDLE_HEAD) || edge->bundle[ABOVE][CLIP] ||
+        edge->bundle[ABOVE][SUBJ]) {
+      add_st_edge(&st, it, edge, dy);
+    }
+  }
+
+  /* Free the sorted edge table */
+  while (st) {
+    stp = st->prev;
+    gpc_free<st_node>(st);
+    st = stp;
+  }
+}
+
+static int count_contours(polygon_node *polygon) {
+  int nc = 0;
+  int nv = 0;
+  vertex_node *v = NULL;
+  vertex_node *nextv = NULL;
+
+  for (nc = 0; polygon; polygon = polygon->next) {
+    if (polygon->active) {
+      /* Count the vertices in the current contour */
+      nv = 0;
+      for (v = polygon->proxy->v[LEFT]; v; v = v->next) {
+        nv++;
+      }
+
+      /* Record valid vertex counts in the active field */
+      if (nv > 2) {
+        polygon->active = nv;
+        nc++;
+      } else {
+        /* Invalid contour: just free the heap */
+        for (v = polygon->proxy->v[LEFT]; v; v = nextv) {
+          nextv = v->next;
+          gpc_free<vertex_node>(v);
+        }
+        polygon->active = 0;
+      }
+    }
+  }
+  return nc;
+}
+
+static void add_left(polygon_node *p, double x, double y) {
+  vertex_node *nv = NULL;
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
+                          const_cast<char *>("vertex node creation"));
+  nv->x = x;
+  nv->y = y;
+
+  /* Add vertex nv to the left end of the polygon's vertex list */
+  nv->next = p->proxy->v[LEFT];
+
+  /* Update proxy->[LEFT] to point to nv */
+  p->proxy->v[LEFT] = nv;
+}
+
+static void merge_left(polygon_node *p, polygon_node *q, polygon_node *list) {
+  polygon_node *target = NULL;
+
+  /* Label contour as a hole */
+  q->proxy->hole = 1;
+
+  if (p->proxy != q->proxy) {
+    /* Assign p's vertex list to the left end of q's list */
+    p->proxy->v[RIGHT]->next = q->proxy->v[LEFT];
+    q->proxy->v[LEFT] = p->proxy->v[LEFT];
+
+    /* Redirect any p->proxy references to q->proxy */
+
+    for (target = p->proxy; list; list = list->next) {
+      if (list->proxy == target) {
+        list->active = 0;
+        list->proxy = q->proxy;
+      }
+    }
+  }
+}
+
+static void add_right(polygon_node *p, double x, double y) {
+  vertex_node *nv = NULL;
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
+                          const_cast<char *>("vertex node creation"));
+  nv->x = x;
+  nv->y = y;
+  nv->next = NULL;
+
+  /* Add vertex nv to the right end of the polygon's vertex list */
+  p->proxy->v[RIGHT]->next = nv;
+
+  /* Update proxy->v[RIGHT] to point to nv */
+  p->proxy->v[RIGHT] = nv;
+}
+
+static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
+  polygon_node *target = NULL;
+
+  /* Label contour as external */
+  q->proxy->hole = 0;
+
+  if (p->proxy != q->proxy) {
+    /* Assign p's vertex list to the right end of q's list */
+    q->proxy->v[RIGHT]->next = p->proxy->v[LEFT];
+    q->proxy->v[RIGHT] = p->proxy->v[RIGHT];
+
+    /* Redirect any p->proxy references to q->proxy */
+    for (target = p->proxy; list; list = list->next) {
+      if (list->proxy == target) {
+        list->active = 0;
+        list->proxy = q->proxy;
+      }
+    }
+  }
+}
+
+static void add_local_min(polygon_node **p, edge_node *edge, double x,
+                          double y) {
+  polygon_node *existing_min = NULL;
+  vertex_node *nv = NULL;
+
+  existing_min = *p;
+
+  gpc_malloc<polygon_node>(*p, sizeof(polygon_node),
+                           const_cast<char *>("polygon node creation"));
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
+                          const_cast<char *>("vertex node creation"));
+  nv->x = x;
+  nv->y = y;
+  nv->next = NULL;
+
+  /* Initialise proxy to point to p itself */
+  (*p)->proxy = (*p);
+  (*p)->active = 1;
+  (*p)->next = existing_min;
+
+  /* Make v[LEFT] and v[RIGHT] point to new vertex nv */
+  (*p)->v[LEFT] = nv;
+  (*p)->v[RIGHT] = nv;
+
+  /* Assign polygon p to the edge */
+  edge->outp[ABOVE] = *p;
+}
+
+static int count_tristrips(polygon_node *tn) {
+  int total = 0;
+
+  for (total = 0; tn; tn = tn->next) {
+    if (tn->active > 2) {
+      total++;
+    }
+  }
+  return total;
+}
+
+void add_vertex(vertex_node **t, double x, double y) {
+  if (!(*t)) {
+    gpc_malloc<vertex_node>(*t, sizeof(vertex_node),
+                            const_cast<char *>("tristrip vertex creation"));
+    (*t)->x = x;
+    (*t)->y = y;
+    (*t)->next = NULL;
+  } else {
+    /* Head further down the list */
+    add_vertex(&((*t)->next), x, y);
+  }
+}
+
+void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
+  add_vertex(&(e->outp[p]->v[s]), x, y);
+  e->outp[p]->active++;
+}
+
+static void new_tristrip(polygon_node **tn, edge_node *edge, double x,
+                         double y) {
+  if (!(*tn)) {
+    gpc_malloc<polygon_node>(*tn, sizeof(polygon_node),
+                             const_cast<char *>("tristrip node creation"));
+    (*tn)->next = NULL;
+    (*tn)->v[LEFT] = NULL;
+    (*tn)->v[RIGHT] = NULL;
+    (*tn)->active = 1;
+    add_vertex(&((*tn)->v[LEFT]), x, y);
+    edge->outp[ABOVE] = *tn;
+  } else {
+    /* Head further down the list */
+    new_tristrip(&((*tn)->next), edge, x, y);
+  }
+}
+
+static bbox *create_contour_bboxes(gpc_polygon *p) {
+  bbox *box;
+  int c = 0;
+  int v = 0;
+
+  gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
+                   const_cast<char *>("Bounding box creation"));
+
+  /* Construct contour bounding boxes */
+  for (c = 0; c < p->num_contours; c++) {
+    /* Initialise bounding box extent */
+    box[c].xmin = DBL_MAX;
+    box[c].ymin = DBL_MAX;
+    box[c].xmax = -DBL_MAX;
+    box[c].ymax = -DBL_MAX;
+
+    for (v = 0; v < p->contour[c].num_vertices; v++) {
+      /* Adjust bounding box */
+      if (p->contour[c].vertex[v].x < box[c].xmin) {
+        box[c].xmin = p->contour[c].vertex[v].x;
+      }
+      if (p->contour[c].vertex[v].y < box[c].ymin) {
+        box[c].ymin = p->contour[c].vertex[v].y;
+      }
+      if (p->contour[c].vertex[v].x > box[c].xmax) {
+        box[c].xmax = p->contour[c].vertex[v].x;
+      }
+      if (p->contour[c].vertex[v].y > box[c].ymax) {
+        box[c].ymax = p->contour[c].vertex[v].y;
+      }
+    }
+  }
+  return box;
+}
+
+static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) {
+  bbox *s_bbox;
+  bbox *c_bbox;
+  int s = 0;
+  int c = 0;
+  int *o_table = NULL;
+  int overlap = 0;
+
+  s_bbox = create_contour_bboxes(subj);
+  c_bbox = create_contour_bboxes(clip);
+
+  gpc_malloc<int>(o_table,
+                  subj->num_contours * clip->num_contours * sizeof(int),
+                  const_cast<char *>("overlap table creation"));
+
+  /* Check all subject contour bounding boxes against clip boxes */
+  for (s = 0; s < subj->num_contours; s++) {
+    for (c = 0; c < clip->num_contours; c++) {
+      o_table[c * subj->num_contours + s] =
+          (!((s_bbox[s].xmax < c_bbox[c].xmin) ||
+             (s_bbox[s].xmin > c_bbox[c].xmax))) &&
+          (!((s_bbox[s].ymax < c_bbox[c].ymin) ||
+             (s_bbox[s].ymin > c_bbox[c].ymax)));
+    }
+  }
+
+  /* For each clip contour, search for any subject contour overlaps */
+  for (c = 0; c < clip->num_contours; c++) {
+    overlap = 0;
+    for (s = 0; (!overlap) && (s < subj->num_contours); s++) {
+      overlap = o_table[c * subj->num_contours + s];
+    }
+
+    if (!overlap) {
+      /* Flag non contributing status by negating vertex count */
+      clip->contour[c].num_vertices = -clip->contour[c].num_vertices;
+    }
+  }
+
+  if (op == GPC_INT) {
+    /* For each subject contour, search for any clip contour overlaps */
+    for (s = 0; s < subj->num_contours; s++) {
+      overlap = 0;
+      for (c = 0; (!overlap) && (c < clip->num_contours); c++) {
+        overlap = o_table[c * subj->num_contours + s];
+      }
+
+      if (!overlap) {
+        /* Flag non contributing status by negating vertex count */
+        subj->contour[s].num_vertices = -subj->contour[s].num_vertices;
+      }
+    }
+  }
+
+  gpc_free<bbox>(s_bbox);
+  gpc_free<bbox>(c_bbox);
+  gpc_free<int>(o_table);
+}
+
+/*
+===========================================================================
+                             Public Functions
+===========================================================================
+*/
+
+void gpc_free_polygon(gpc_polygon *p) {
+  int c = 0;
+
+  for (c = 0; c < p->num_contours; c++) {
+    gpc_free<gpc_vertex>(p->contour[c].vertex);
+  }
+  gpc_free<int>(p->hole);
+  gpc_free<gpc_vertex_list>(p->contour);
+  p->num_contours = 0;
+}
+
+/*
+void gpc_read_polygon(FILE *fp, int read_hole_flags, gpc_polygon *p) {
+  int c = 0;
+  int v = 0;
+
+  fscanf(fp, "%d", &(p->num_contours));
+  gpc_malloc<int>(p->hole, p->num_contours * sizeof(int),
+                  (char *)"hole flag array creation");
+  gpc_malloc<gpc_vertex_list>(p->contour,
+                              p->num_contours * sizeof(gpc_vertex_list),
+                              (char *)"contour creation");
+  for (c = 0; c < p->num_contours; c++) {
+    fscanf(fp, "%d", &(p->contour[c].num_vertices));
+
+    if (read_hole_flags) {
+      fscanf(fp, "%d", &(p->hole[c]));
+    } else {
+      p->hole[c] = 0; // Assume all contours to be external
+    }
+
+    gpc_malloc<gpc_vertex>(p->contour[c].vertex,
+                           p->contour[c].num_vertices * sizeof(gpc_vertex),
+                           (char *)"vertex creation");
+    for (v = 0; v < p->contour[c].num_vertices; v++) {
+      fscanf(fp, "%lf %lf", &(p->contour[c].vertex[v].x),
+             &(p->contour[c].vertex[v].y));
+    }
+  }
+}
+
+void gpc_write_polygon(FILE *fp, int write_hole_flags, gpc_polygon *p) {
+  int c = 0;
+  int v = 0;
+
+  fprintf(fp, "%d\n", p->num_contours);
+  for (c = 0; c < p->num_contours; c++) {
+    fprintf(fp, "%d\n", p->contour[c].num_vertices);
+
+    if (write_hole_flags) {
+      fprintf(fp, "%d\n", p->hole[c]);
+    }
+
+    for (v = 0; v < p->contour[c].num_vertices; v++) {
+      fprintf(fp, "% .*lf % .*lf\n", DBL_DIG, p->contour[c].vertex[v].x,
+              DBL_DIG, p->contour[c].vertex[v].y);
+    }
+  }
+}
+*/
+
+void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
+  int *extended_hole = NULL;
+  int c = 0;
+  int v = 0;
+  gpc_vertex_list *extended_contour = NULL;
+
+  /* Create an extended hole array */
+  gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
+                  const_cast<char *>("contour hole addition"));
+
+  /* Create an extended contour array */
+  gpc_malloc<gpc_vertex_list>(extended_contour,
+                              (p->num_contours + 1) * sizeof(gpc_vertex_list),
+                              const_cast<char *>("contour addition"));
+
+  /* Copy the old contour and hole data into the extended arrays */
+  for (c = 0; c < p->num_contours; c++) {
+    extended_hole[c] = p->hole[c];
+    extended_contour[c] = p->contour[c];
+  }
+
+  /* Copy the new contour and hole onto the end of the extended arrays */
+  c = p->num_contours;
+  extended_hole[c] = hole;
+  extended_contour[c].num_vertices = new_contour->num_vertices;
+  gpc_malloc<gpc_vertex>(extended_contour[c].vertex,
+                         new_contour->num_vertices * sizeof(gpc_vertex),
+                         const_cast<char *>("contour addition"));
+  for (v = 0; v < new_contour->num_vertices; v++) {
+    extended_contour[c].vertex[v] = new_contour->vertex[v];
+  }
+
+  /* Dispose of the old contour */
+  gpc_free<gpc_vertex_list>(p->contour);
+  gpc_free<int>(p->hole);
+
+  /* Update the polygon information */
+  p->num_contours++;
+  p->hole = extended_hole;
+  p->contour = extended_contour;
+}
+
+// gpc_polygon_clip
+void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
+                      gpc_polygon *result) {
+  sb_tree *sbtree = NULL;
+  it_node *it = NULL;
+  it_node *intersect = NULL;
+  edge_node *edge = NULL;
+  edge_node *prev_edge = NULL;
+  edge_node *next_edge = NULL;
+  edge_node *succ_edge = NULL;
+  edge_node *e0 = NULL;
+  edge_node *e1 = NULL;
+  edge_node *aet = NULL;
+  edge_node *c_heap = NULL;
+  edge_node *s_heap = NULL;
+  lmt_node *lmt = NULL;
+  lmt_node *local_min = NULL;
+  polygon_node *out_poly = NULL;
+  polygon_node *p = NULL;
+  polygon_node *q = NULL;
+  polygon_node *poly = NULL;
+  polygon_node *npoly = NULL;
+  polygon_node *cf = NULL;
+  vertex_node *vtx = NULL;
+  vertex_node *nv = NULL;
+  h_state horiz[2];
+  int in[2];
+  int exists[2];
+  int parity[2] = {LEFT, LEFT};
+  int c = 0;
+  int v = 0;
+  int contributing = 0;
+  int search = 0;
+  int scanbeam = 0;
+  int sbt_entries = 0;
+  int vclass = 0;
+  int bl = 0;
+  int br = 0;
+  int tl = 0;
+  int tr = 0;
+  double *sbt = NULL;
+  double xb = 0.0;
+  double px = 0.0;
+  double yb = 0.0;
+  double yt = 0.0;
+  double dy = 0.0;
+  double ix = 0.0;
+  double iy = 0.0;
+
+  /* Test for trivial NULL result cases */
+  if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
+      ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
+      ((clip->num_contours == 0) && (op == GPC_INT))) {
+    result->num_contours = 0;
+    result->hole = NULL;
+    result->contour = NULL;
+    return;
+  }
+  /* Identify potentialy contributing contours */
+  if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) &&
+      (clip->num_contours > 0)) {
+    minimax_test(subj, clip, op);
+  }
+  /* Build LMT */
+  if (subj->num_contours > 0) {
+    s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op);
+  }
+  if (clip->num_contours > 0) {
+    c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op);
+  }
+  /* Return a NULL result if no contours contribute */
+  if (lmt == NULL) {
+    result->num_contours = 0;
+    result->hole = NULL;
+    result->contour = NULL;
+    reset_lmt(&lmt);
+    gpc_free<edge_node>(s_heap);
+    gpc_free<edge_node>(c_heap);
+    return;
+  }
+
+  /* Build scanbeam table from scanbeam tree */
+  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
+                     const_cast<char *>("sbt creation"));
+  build_sbt(&scanbeam, sbt, sbtree);
+  scanbeam = 0;
+  free_sbtree(&sbtree);
+  /* Allow pointer re-use without causing memory leak */
+  if (subj == result) {
+    gpc_free_polygon(subj);
+  }
+  if (clip == result) {
+    gpc_free_polygon(clip);
+  }
+  /* Invert clip polygon for difference operation */
+  if (op == GPC_DIFF) {
+    parity[CLIP] = RIGHT;
+  }
+  local_min = lmt;
+
+  // Process each scanbeam
+  while (scanbeam < sbt_entries) {
+    /* Set yb and yt to the bottom and top of the scanbeam */
+    yb = sbt[scanbeam++];
+    if (scanbeam < sbt_entries) {
+      yt = sbt[scanbeam];
+      dy = yt - yb;
+    }
+    /* === SCANBEAM BOUNDARY PROCESSING ================================ */
+    /* If LMT node corresponding to yb exists */
+    if (local_min) {
+      if (local_min->y == yb) {
+        /* Add edges starting at this local minimum to the AET */
+        for (edge = local_min->first_bound; edge; edge = edge->next_bound) {
+          add_edge_to_aet(&aet, edge, NULL);
+        }
+        local_min = local_min->next;
+      }
+    }
+    /* Set dummy previous x value */
+    px = -DBL_MAX;
+    /* Create bundles within AET */
+    e0 = aet;
+    e1 = aet;
+    /* Set up bundle fields of first edge */
+    aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
+    aet->bundle[ABOVE][!aet->type] = 0;
+    aet->bstate[ABOVE] = UNBUNDLED;
+
+    for (next_edge = aet->next; next_edge; next_edge = next_edge->next) {
+      /* Set up bundle fields of next edge */
+      next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb);
+      next_edge->bundle[ABOVE][!next_edge->type] = 0;
+      next_edge->bstate[ABOVE] = UNBUNDLED;
+      /* Bundle edges above the scanbeam boundary if they coincide */
+      if (next_edge->bundle[ABOVE][next_edge->type]) {
+        if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) &&
+            (e0->top.y != yb)) {
+          next_edge->bundle[ABOVE][next_edge->type] ^=
+              e0->bundle[ABOVE][next_edge->type];
+          next_edge->bundle[ABOVE][!next_edge->type] =
+              e0->bundle[ABOVE][!next_edge->type];
+          next_edge->bstate[ABOVE] = BUNDLE_HEAD;
+          e0->bundle[ABOVE][CLIP] = 0;
+          e0->bundle[ABOVE][SUBJ] = 0;
+          e0->bstate[ABOVE] = BUNDLE_TAIL;
+        }
+        e0 = next_edge;
+      }
+    }
+    horiz[CLIP] = NH;
+    horiz[SUBJ] = NH;
+
+    // Process each edge at this scanbeam boundary
+    for (edge = aet; edge; edge = edge->next) {
+      exists[CLIP] =
+          edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1);
+      exists[SUBJ] =
+          edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1);
+      if (exists[CLIP] || exists[SUBJ]) {
+        /* Set bundle side */
+        edge->bside[CLIP] = parity[CLIP];
+        edge->bside[SUBJ] = parity[SUBJ];
+        /* Determine contributing status and quadrant occupancies */
+        switch (op) {
+          case GPC_DIFF:
+          case GPC_INT:
+            contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) ||
+                           (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) ||
+                           (exists[CLIP] && exists[SUBJ] &&
+                            (parity[CLIP] == parity[SUBJ]));
+            br = (parity[CLIP]) && (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) &&
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) &&
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) &&
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+          case GPC_XOR:
+            contributing = exists[CLIP] || exists[SUBJ];
+            br = (parity[CLIP]) ^ (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) ^
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+          case GPC_UNION:
+            contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) ||
+                           (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) ||
+                           (exists[CLIP] && exists[SUBJ] &&
+                            (parity[CLIP] == parity[SUBJ]));
+            br = (parity[CLIP]) || (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ||
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ||
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) ||
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+        }
+        // Update parity
+        parity[CLIP] ^= edge->bundle[ABOVE][CLIP];
+        parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ];
+        /* Update horizontal state */
+        if (exists[CLIP]) {
+          horiz[CLIP] = next_h_state[horiz[CLIP]]
+                                    [((exists[CLIP] - 1) << 1) + parity[CLIP]];
+        }
+        if (exists[SUBJ]) {
+          horiz[SUBJ] = next_h_state[horiz[SUBJ]]
+                                    [((exists[SUBJ] - 1) << 1) + parity[SUBJ]];
+        }
+        vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
+        if (contributing) {
+          xb = edge->xb;
+          switch (vclass) {
+            case EMN:
+            case IMN:
+              add_local_min(&out_poly, edge, xb, yb);
+              px = xb;
+              cf = edge->outp[ABOVE];
+              break;
+            case ERI:
+              if (xb != px) {
+                add_right(cf, xb, yb);
+                px = xb;
+              }
+              edge->outp[ABOVE] = cf;
+              cf = NULL;
+              break;
+            case ELI:
+              add_left(edge->outp[BELOW], xb, yb);
+              px = xb;
+              cf = edge->outp[BELOW];
+              break;
+            case EMX:
+              if (xb != px) {
+                add_left(cf, xb, yb);
+                px = xb;
+              }
+              merge_right(cf, edge->outp[BELOW], out_poly);
+              cf = NULL;
+              break;
+            case ILI:
+              if (xb != px) {
+                add_left(cf, xb, yb);
+                px = xb;
+              }
+              edge->outp[ABOVE] = cf;
+              cf = NULL;
+              break;
+            case IRI:
+              add_right(edge->outp[BELOW], xb, yb);
+              px = xb;
+              cf = edge->outp[BELOW];
+              edge->outp[BELOW] = NULL;
+              break;
+            case IMX:
+              if (xb != px) {
+                add_right(cf, xb, yb);
+                px = xb;
+              }
+              merge_left(cf, edge->outp[BELOW], out_poly);
+              cf = NULL;
+              edge->outp[BELOW] = NULL;
+              break;
+            case IMM:
+              if (xb != px) {
+                add_right(cf, xb, yb);
+                px = xb;
+              }
+              merge_left(cf, edge->outp[BELOW], out_poly);
+              edge->outp[BELOW] = NULL;
+              add_local_min(&out_poly, edge, xb, yb);
+              cf = edge->outp[ABOVE];
+              break;
+            case EMM:
+              if (xb != px) {
+                add_left(cf, xb, yb);
+                px = xb;
+              }
+              merge_right(cf, edge->outp[BELOW], out_poly);
+              edge->outp[BELOW] = NULL;
+              add_local_min(&out_poly, edge, xb, yb);
+              cf = edge->outp[ABOVE];
+              break;
+            case LED:
+              if (edge->bot.y == yb) {
+                add_left(edge->outp[BELOW], xb, yb);
+              }
+              edge->outp[ABOVE] = edge->outp[BELOW];
+              px = xb;
+              break;
+            case RED:
+              if (edge->bot.y == yb) {
+                add_right(edge->outp[BELOW], xb, yb);
+              }
+              edge->outp[ABOVE] = edge->outp[BELOW];
+              px = xb;
+              break;
+            default:
+              break;
+          } /* End of switch */
+        }   /* End of contributing conditional */
+      }     /* End of edge exists conditional */
+    }       // End of AET loop
+
+    /* Delete terminating edges from the AET, otherwise compute xt */
+    for (edge = aet; edge; edge = edge->next) {
+      if (edge->top.y == yb) {
+        prev_edge = edge->prev;
+        next_edge = edge->next;
+        if (prev_edge) {
+          prev_edge->next = next_edge;
+        } else {
+          aet = next_edge;
+        }
+        if (next_edge) {
+          next_edge->prev = prev_edge;
+        }
+        /* Copy bundle head state to the adjacent tail edge if required */
+        if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) {
+          if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) {
+            prev_edge->outp[BELOW] = edge->outp[BELOW];
+            prev_edge->bstate[BELOW] = UNBUNDLED;
+            if (prev_edge->prev) {
+              if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) {
+                prev_edge->bstate[BELOW] = BUNDLE_HEAD;
+              }
+            }
+          }
+        }
+      } else {
+        if (edge->top.y == yt) {
+          edge->xt = edge->top.x;
+        } else {
+          edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y);
+        }
+      }
+    }
+
+    if (scanbeam < sbt_entries) {
+      /* === SCANBEAM INTERIOR PROCESSING ============================== */
+      build_intersection_table(&it, aet, dy);
+      /* Process each node in the intersection table */
+      for (intersect = it; intersect; intersect = intersect->next) {
+        e0 = intersect->ie[0];
+        e1 = intersect->ie[1];
+        /* Only generate output for contributing intersections */
+        if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) &&
+            (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) {
+          p = e0->outp[ABOVE];
+          q = e1->outp[ABOVE];
+          ix = intersect->point.x;
+          iy = intersect->point.y + yb;
+
+          in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) ||
+                     (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) ||
+                     (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] &&
+                      e0->bside[CLIP] && e1->bside[CLIP]);
+          in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) ||
+                     (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) ||
+                     (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] &&
+                      e0->bside[SUBJ] && e1->bside[SUBJ]);
+
+          // Determine quadrant occupancies
+          switch (op) {
+            case GPC_DIFF:
+            case GPC_INT:
+              tr = (in[CLIP]) && (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+            case GPC_XOR:
+              tr = (in[CLIP]) ^ (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+            case GPC_UNION:
+              tr = (in[CLIP]) || (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+          }
+          vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
+          switch (vclass) {
+            case EMN:
+              add_local_min(&out_poly, e0, ix, iy);
+              e1->outp[ABOVE] = e0->outp[ABOVE];
+              break;
+            case ERI:
+              if (p) {
+                add_right(p, ix, iy);
+                e1->outp[ABOVE] = p;
+                e0->outp[ABOVE] = NULL;
+              }
+              break;
+            case ELI:
+              if (q) {
+                add_left(q, ix, iy);
+                e0->outp[ABOVE] = q;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case EMX:
+              if (p && q) {
+                add_left(p, ix, iy);
+                merge_right(p, q, out_poly);
+                e0->outp[ABOVE] = NULL;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMN:
+              add_local_min(&out_poly, e0, ix, iy);
+              e1->outp[ABOVE] = e0->outp[ABOVE];
+              break;
+            case ILI:
+              if (p) {
+                add_left(p, ix, iy);
+                e1->outp[ABOVE] = p;
+                e0->outp[ABOVE] = NULL;
+              }
+              break;
+            case IRI:
+              if (q) {
+                add_right(q, ix, iy);
+                e0->outp[ABOVE] = q;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMX:
+              if (p && q) {
+                add_right(p, ix, iy);
+                merge_left(p, q, out_poly);
+                e0->outp[ABOVE] = NULL;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMM:
+              if (p && q) {
+                add_right(p, ix, iy);
+                merge_left(p, q, out_poly);
+                add_local_min(&out_poly, e0, ix, iy);
+                e1->outp[ABOVE] = e0->outp[ABOVE];
+              }
+              break;
+            case EMM:
+              if (p && q) {
+                add_left(p, ix, iy);
+                merge_right(p, q, out_poly);
+                add_local_min(&out_poly, e0, ix, iy);
+                e1->outp[ABOVE] = e0->outp[ABOVE];
+              }
+              break;
+            default:
+              break;
+          }  // End of switch
+        }    /* End of contributing intersection conditional */
+
+        /* Swap bundle sides in response to edge crossing */
+        if (e0->bundle[ABOVE][CLIP]) {
+          e1->bside[CLIP] = !e1->bside[CLIP];
+        }
+        if (e1->bundle[ABOVE][CLIP]) {
+          e0->bside[CLIP] = !e0->bside[CLIP];
+        }
+        if (e0->bundle[ABOVE][SUBJ]) {
+          e1->bside[SUBJ] = !e1->bside[SUBJ];
+        }
+        if (e1->bundle[ABOVE][SUBJ]) {
+          e0->bside[SUBJ] = !e0->bside[SUBJ];
+        }
+
+        /* Swap e0 and e1 bundles in the AET */
+        prev_edge = e0->prev;
+        next_edge = e1->next;
+        if (next_edge) {
+          next_edge->prev = e0;
+        }
+        if (e0->bstate[ABOVE] == BUNDLE_HEAD) {
+          search = 1;
+          while (search) {
+            prev_edge = prev_edge->prev;
+            if (prev_edge) {
+              if (prev_edge->bstate[ABOVE] != BUNDLE_TAIL) {
+                search = 0;
+              }
+            } else {
+              search = 0;
+            }
+          }
+        }
+        if (!prev_edge) {
+          aet->prev = e1;
+          e1->next = aet;
+          aet = e0->next;
+        } else {
+          prev_edge->next->prev = e1;
+          e1->next = prev_edge->next;
+          prev_edge->next = e0->next;
+        }
+        e0->next->prev = prev_edge;
+        e1->next->prev = e1;
+        e0->next = next_edge;
+      } /* End of IT loop*/
+
+      // Prepare for next scanbeam
+      for (edge = aet; edge; edge = next_edge) {
+        next_edge = edge->next;
+        succ_edge = edge->succ;
+        if ((edge->top.y == yt) && succ_edge) {
+          /* Replace AET edge by its successor */
+          succ_edge->outp[BELOW] = edge->outp[ABOVE];
+          succ_edge->bstate[BELOW] = edge->bstate[ABOVE];
+          succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
+          succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
+          prev_edge = edge->prev;
+          if (prev_edge) {
+            prev_edge->next = succ_edge;
+          } else {
+            aet = succ_edge;
+          }
+          if (next_edge) {
+            next_edge->prev = succ_edge;
+          }
+          succ_edge->prev = prev_edge;
+          succ_edge->next = next_edge;
+        } else {
+          /* Update this edge */
+          edge->outp[BELOW] = edge->outp[ABOVE];
+          edge->bstate[BELOW] = edge->bstate[ABOVE];
+          edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
+          edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
+          edge->xb = edge->xt;
+        }
+        edge->outp[ABOVE] = NULL;
+      }
+    }
+  } /* === END OF SCANBEAM PROCESSING ================================== */
+  // Generate result polygon from out_poly
+  result->contour = NULL;
+  result->hole = NULL;
+  result->num_contours = count_contours(out_poly);
+  if (result->num_contours > 0) {
+    gpc_malloc<int>(result->hole, result->num_contours * sizeof(int),
+                    const_cast<char *>("hole flag table creation"));
+    gpc_malloc<gpc_vertex_list>(result->contour,
+                                result->num_contours * sizeof(gpc_vertex_list),
+                                const_cast<char *>("contour creation"));
+
+    c = 0;
+    for (poly = out_poly; poly; poly = npoly) {
+      npoly = poly->next;
+      if (poly->active) {
+        result->hole[c] = poly->proxy->hole;
+        result->contour[c].num_vertices = poly->active;
+        gpc_malloc<gpc_vertex>(
+            result->contour[c].vertex,
+            result->contour[c].num_vertices * sizeof(gpc_vertex),
+            const_cast<char *>("vertex creation"));
+
+        v = result->contour[c].num_vertices - 1;
+        for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) {
+          nv = vtx->next;
+          result->contour[c].vertex[v].x = vtx->x;
+          result->contour[c].vertex[v].y = vtx->y;
+          gpc_free<vertex_node>(vtx);
+          v--;
+        }
+        c++;
+      }
+      gpc_free<polygon_node>(poly);
+    }
+  } else {
+    for (poly = out_poly; poly; poly = npoly) {
+      npoly = poly->next;
+      gpc_free<polygon_node>(poly);
+    }
+  }
+
+  // Tidy up
+  reset_it(&it);
+  reset_lmt(&lmt);
+  gpc_free<edge_node>(c_heap);
+  gpc_free<edge_node>(s_heap);
+  gpc_free<double>(sbt);
+}  // NOLINT
+
+void gpc_free_tristrip(gpc_tristrip *t) {
+  int s = 0;
+  for (s = 0; s < t->num_strips; s++) {
+    gpc_free<gpc_vertex>(t->strip[s].vertex);
+  }
+  gpc_free<gpc_vertex_list>(t->strip);
+  t->num_strips = 0;
+}
+
+void gpc_polygon_to_tristrip(gpc_polygon *s, gpc_tristrip *t) {
+  gpc_polygon c;
+  c.num_contours = 0;
+  c.hole = NULL;
+  c.contour = NULL;
+  gpc_tristrip_clip(GPC_DIFF, s, &c, t);
+}
+
+// gpc_tristrip_clip
+void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
+                       gpc_tristrip *result) {
+  sb_tree *sbtree = NULL;
+  it_node *it = NULL;
+  it_node *intersect = NULL;
+  edge_node *edge = NULL;
+  edge_node *prev_edge = NULL;
+  edge_node *next_edge = NULL;
+  edge_node *succ_edge = NULL;
+  edge_node *e0 = NULL;
+  edge_node *e1 = NULL;
+  edge_node *aet = NULL;
+  edge_node *c_heap = NULL;
+  edge_node *s_heap = NULL;
+  edge_node *cf = NULL;
+  lmt_node *lmt = NULL;
+  lmt_node *local_min = NULL;
+  polygon_node *tlist = NULL;
+  polygon_node *tn = NULL;
+  polygon_node *tnn = NULL;
+  polygon_node *p = NULL;
+  polygon_node *q = NULL;
+  vertex_node *lt = NULL;
+  vertex_node *ltn = NULL;
+  vertex_node *rt = NULL;
+  vertex_node *rtn = NULL;
+  h_state horiz[2];
+  vertex_type cft = NUL;
+  int in[2];
+  int exists[2];
+  int parity[2] = {LEFT, LEFT};
+  int s = 0;
+  int v = 0;
+  int contributing = 0;
+  int search = 0;
+  int scanbeam = 0;
+  int sbt_entries = 0;
+  int vclass = 0;
+  int bl = 0;
+  int br = 0;
+  int tl = 0;
+  int tr = 0;
+  double *sbt = NULL;
+  double xb = 0.0;
+  double px = 0.0;
+  double nx = 0.0;
+  double yb = 0.0;
+  double yt = 0.0;
+  double dy = 0.0;
+  double ix = 0.0;
+  double iy = 0.0;
+
+  /* Test for trivial NULL result cases */
+  if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
+      ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
+      ((clip->num_contours == 0) && (op == GPC_INT))) {
+    result->num_strips = 0;
+    result->strip = NULL;
+    return;
+  }
+
+  /* Identify potentialy contributing contours */
+  if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) &&
+      (clip->num_contours > 0)) {
+    minimax_test(subj, clip, op);
+  }
+  /* Build LMT */
+  if (subj->num_contours > 0) {
+    s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op);
+  }
+  if (clip->num_contours > 0) {
+    c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op);
+  }
+  /* Return a NULL result if no contours contribute */
+  if (lmt == NULL) {
+    result->num_strips = 0;
+    result->strip = NULL;
+    reset_lmt(&lmt);
+    gpc_free<edge_node>(s_heap);
+    gpc_free<edge_node>(c_heap);
+    return;
+  }
+
+  /* Build scanbeam table from scanbeam tree */
+  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
+                     const_cast<char *>("sbt creation"));
+  build_sbt(&scanbeam, sbt, sbtree);
+  scanbeam = 0;
+  free_sbtree(&sbtree);
+
+  /* Invert clip polygon for difference operation */
+  if (op == GPC_DIFF) {
+    parity[CLIP] = RIGHT;
+  }
+  local_min = lmt;
+
+  // Process each scanbeam
+  while (scanbeam < sbt_entries) {
+    /* Set yb and yt to the bottom and top of the scanbeam */
+    yb = sbt[scanbeam++];
+    if (scanbeam < sbt_entries) {
+      yt = sbt[scanbeam];
+      dy = yt - yb;
+    }
+
+    /* === SCANBEAM BOUNDARY PROCESSING ================================ */
+    /* If LMT node corresponding to yb exists */
+    if (local_min) {
+      if (local_min->y == yb) {
+        /* Add edges starting at this local minimum to the AET */
+        for (edge = local_min->first_bound; edge; edge = edge->next_bound) {
+          add_edge_to_aet(&aet, edge, NULL);
+        }
+        local_min = local_min->next;
+      }
+    }
+    /* Set dummy previous x value */
+    /* Create bundles within AET */
+    px = -DBL_MAX;
+    e0 = aet;
+    e1 = aet;
+
+    /* Set up bundle fields of first edge */
+    aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
+    aet->bundle[ABOVE][!aet->type] = 0;
+    aet->bstate[ABOVE] = UNBUNDLED;
+
+    for (next_edge = aet->next; next_edge; next_edge = next_edge->next) {
+      /* Set up bundle fields of next edge */
+      next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb);
+      next_edge->bundle[ABOVE][!next_edge->type] = 0;
+      next_edge->bstate[ABOVE] = UNBUNDLED;
+
+      /* Bundle edges above the scanbeam boundary if they coincide */
+      if (next_edge->bundle[ABOVE][next_edge->type]) {
+        if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) &&
+            (e0->top.y != yb)) {
+          next_edge->bundle[ABOVE][next_edge->type] ^=
+              e0->bundle[ABOVE][next_edge->type];
+          next_edge->bundle[ABOVE][!next_edge->type] =
+              e0->bundle[ABOVE][!next_edge->type];
+          next_edge->bstate[ABOVE] = BUNDLE_HEAD;
+          e0->bundle[ABOVE][CLIP] = 0;
+          e0->bundle[ABOVE][SUBJ] = 0;
+          e0->bstate[ABOVE] = BUNDLE_TAIL;
+        }
+        e0 = next_edge;
+      }
+    }
+    horiz[CLIP] = NH;
+    horiz[SUBJ] = NH;
+
+    /* Process each edge at this scanbeam boundary */
+    for (edge = aet; edge; edge = edge->next) {
+      exists[CLIP] =
+          edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1);
+      exists[SUBJ] =
+          edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1);
+
+      if (exists[CLIP] || exists[SUBJ]) {
+        /* Set bundle side */
+        edge->bside[CLIP] = parity[CLIP];
+        edge->bside[SUBJ] = parity[SUBJ];
+
+        /* Determine contributing status and quadrant occupancies */
+        switch (op) {
+          case GPC_DIFF:
+          case GPC_INT:
+            contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) ||
+                           (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) ||
+                           (exists[CLIP] && exists[SUBJ] &&
+                            (parity[CLIP] == parity[SUBJ]));
+            br = (parity[CLIP]) && (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) &&
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) &&
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) &&
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+          case GPC_XOR:
+            contributing = exists[CLIP] || exists[SUBJ];
+            br = (parity[CLIP]) ^ (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) ^
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+          case GPC_UNION:
+            contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) ||
+                           (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) ||
+                           (exists[CLIP] && exists[SUBJ] &&
+                            (parity[CLIP] == parity[SUBJ]));
+            br = (parity[CLIP]) || (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ||
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ||
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) ||
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+        }
+
+        // Update parity
+        parity[CLIP] ^= edge->bundle[ABOVE][CLIP];
+        parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ];
+
+        /* Update horizontal state */
+        if (exists[CLIP]) {
+          horiz[CLIP] = next_h_state[horiz[CLIP]]
+                                    [((exists[CLIP] - 1) << 1) + parity[CLIP]];
+        }
+        if (exists[SUBJ]) {
+          horiz[SUBJ] = next_h_state[horiz[SUBJ]]
+                                    [((exists[SUBJ] - 1) << 1) + parity[SUBJ]];
+        }
+        vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
+
+        if (contributing) {
+          xb = edge->xb;
+          switch (vclass) {
+            case EMN:
+              new_tristrip(&tlist, edge, xb, yb);
+              cf = edge;
+              break;
+            case ERI:
+              edge->outp[ABOVE] = cf->outp[ABOVE];
+              if (xb != cf->xb) {
+                gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
+              }
+              cf = NULL;
+              break;
+            case ELI:
+              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
+              edge->outp[ABOVE] = NULL;
+              cf = edge;
+              break;
+            case EMX:
+              if (xb != cf->xb) {
+                gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+              }
+              edge->outp[ABOVE] = NULL;
+              cf = NULL;
+              break;
+            case IMN:
+              if (cft == LED) {
+                if (cf->bot.y != yb) {
+                  gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
+                }
+                new_tristrip(&tlist, cf, cf->xb, yb);
+              }
+              edge->outp[ABOVE] = cf->outp[ABOVE];
+              gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
+              break;
+            case ILI:
+              new_tristrip(&tlist, edge, xb, yb);
+              cf = edge;
+              cft = ILI;
+              break;
+            case IRI:
+              if (cft == LED) {
+                if (cf->bot.y != yb) {
+                  gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
+                }
+                new_tristrip(&tlist, cf, cf->xb, yb);
+              }
+              gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+              edge->outp[ABOVE] = NULL;
+              break;
+            case IMX:
+              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
+              edge->outp[ABOVE] = NULL;
+              cft = IMX;
+              break;
+            case IMM:
+              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
+              edge->outp[ABOVE] = cf->outp[ABOVE];
+              if (xb != cf->xb) {
+                gpc_vertex_create(cf, ABOVE, RIGHT, xb, yb);
+              }
+              cf = edge;
+              break;
+            case EMM:
+              gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+              edge->outp[ABOVE] = NULL;
+              new_tristrip(&tlist, edge, xb, yb);
+              cf = edge;
+              break;
+            case LED:
+              if (edge->bot.y == yb) {
+                gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
+              }
+              edge->outp[ABOVE] = edge->outp[BELOW];
+              cf = edge;
+              cft = LED;
+              break;
+            case RED:
+              edge->outp[ABOVE] = cf->outp[ABOVE];
+              if (cft == LED) {
+                if (cf->bot.y == yb) {
+                  gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+                } else {
+                  if (edge->bot.y == yb) {
+                    gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
+                    gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+                  }
+                }
+              } else {
+                gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+                gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
+              }
+              cf = NULL;
+              break;
+            default:
+              break;
+          } /* End of switch */
+        }   /* End of contributing conditional */
+      }     /* End of edge exists conditional */
+    }       // End of AET loop
+
+    /* Delete terminating edges from the AET, otherwise compute xt */
+    for (edge = aet; edge; edge = edge->next) {
+      if (edge->top.y == yb) {
+        prev_edge = edge->prev;
+        next_edge = edge->next;
+        if (prev_edge) {
+          prev_edge->next = next_edge;
+        } else {
+          aet = next_edge;
+        }
+        if (next_edge) {
+          next_edge->prev = prev_edge;
+        }
+
+        /* Copy bundle head state to the adjacent tail edge if required */
+        if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) {
+          if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) {
+            prev_edge->outp[BELOW] = edge->outp[BELOW];
+            prev_edge->bstate[BELOW] = UNBUNDLED;
+            if (prev_edge->prev) {
+              if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) {
+                prev_edge->bstate[BELOW] = BUNDLE_HEAD;
+              }
+            }
+          }
+        }
+      } else {
+        if (edge->top.y == yt) {
+          edge->xt = edge->top.x;
+        } else {
+          edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y);
+        }
+      }
+    }
+
+    if (scanbeam < sbt_entries) {
+      /* === SCANBEAM INTERIOR PROCESSING ============================== */
+      build_intersection_table(&it, aet, dy);
+      /* Process each node in the intersection table */
+      for (intersect = it; intersect; intersect = intersect->next) {
+        e0 = intersect->ie[0];
+        e1 = intersect->ie[1];
+
+        /* Only generate output for contributing intersections */
+        if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) &&
+            (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) {
+          p = e0->outp[ABOVE];
+          q = e1->outp[ABOVE];
+          ix = intersect->point.x;
+          iy = intersect->point.y + yb;
+
+          in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) ||
+                     (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) ||
+                     (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] &&
+                      e0->bside[CLIP] && e1->bside[CLIP]);
+          in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) ||
+                     (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) ||
+                     (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] &&
+                      e0->bside[SUBJ] && e1->bside[SUBJ]);
+
+          switch (op) {  // Determine quadrant occupancies
+            case GPC_DIFF:
+            case GPC_INT:
+              tr = (in[CLIP]) && (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+            case GPC_XOR:
+              tr = (in[CLIP]) ^ (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+            case GPC_UNION:
+              tr = (in[CLIP]) || (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+          }
+
+          vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
+          switch (vclass) {
+            case EMN:
+              new_tristrip(&tlist, e1, ix, iy);
+              e0->outp[ABOVE] = e1->outp[ABOVE];
+              break;
+            case ERI:
+              if (p) {
+                gpc_p_edge(prev_edge, e0, ABOVE);
+                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
+                e1->outp[ABOVE] = e0->outp[ABOVE];
+                e0->outp[ABOVE] = NULL;
+              }
+              break;
+            case ELI:
+              if (q) {
+                gpc_n_edge(next_edge, e1, ABOVE);
+                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+                e0->outp[ABOVE] = e1->outp[ABOVE];
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case EMX:
+              if (p && q) {
+                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
+                e0->outp[ABOVE] = NULL;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMN:
+              gpc_p_edge(prev_edge, e0, ABOVE);
+              gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+              gpc_n_edge(next_edge, e1, ABOVE);
+              gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+              new_tristrip(&tlist, prev_edge, px, iy);
+              e1->outp[ABOVE] = prev_edge->outp[ABOVE];
+              gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
+              new_tristrip(&tlist, e0, ix, iy);
+              next_edge->outp[ABOVE] = e0->outp[ABOVE];
+              gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+              break;
+            case ILI:
+              if (p) {
+                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
+                gpc_n_edge(next_edge, e1, ABOVE);
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+                e1->outp[ABOVE] = e0->outp[ABOVE];
+                e0->outp[ABOVE] = NULL;
+              }
+              break;
+            case IRI:
+              if (q) {
+                gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
+                gpc_p_edge(prev_edge, e0, ABOVE);
+                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+                e0->outp[ABOVE] = e1->outp[ABOVE];
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMX:
+              if (p && q) {
+                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
+                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
+                e0->outp[ABOVE] = NULL;
+                e1->outp[ABOVE] = NULL;
+                gpc_p_edge(prev_edge, e0, ABOVE);
+                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+                new_tristrip(&tlist, prev_edge, px, iy);
+                gpc_n_edge(next_edge, e1, ABOVE);
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+                next_edge->outp[ABOVE] = prev_edge->outp[ABOVE];
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+              }
+              break;
+            case IMM:
+              if (p && q) {
+                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
+                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
+                gpc_p_edge(prev_edge, e0, ABOVE);
+                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+                new_tristrip(&tlist, prev_edge, px, iy);
+                gpc_n_edge(next_edge, e1, ABOVE);
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+                e1->outp[ABOVE] = prev_edge->outp[ABOVE];
+                gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
+                new_tristrip(&tlist, e0, ix, iy);
+                next_edge->outp[ABOVE] = e0->outp[ABOVE];
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+              }
+              break;
+            case EMM:
+              if (p && q) {
+                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
+                new_tristrip(&tlist, e1, ix, iy);
+                e0->outp[ABOVE] = e1->outp[ABOVE];
+              }
+              break;
+            default:
+              break;
+          } /* End of switch */
+        }   /* End of contributing intersection conditional */
+
+        // Swap bundle sides in response to edge crossing
+        if (e0->bundle[ABOVE][CLIP]) {
+          e1->bside[CLIP] = !e1->bside[CLIP];
+        }
+        if (e1->bundle[ABOVE][CLIP]) {
+          e0->bside[CLIP] = !e0->bside[CLIP];
+        }
+        if (e0->bundle[ABOVE][SUBJ]) {
+          e1->bside[SUBJ] = !e1->bside[SUBJ];
+        }
+        if (e1->bundle[ABOVE][SUBJ]) {
+          e0->bside[SUBJ] = !e0->bside[SUBJ];
+        }
+
+        /* Swap e0 and e1 bundles in the AET */
+        prev_edge = e0->prev;
+        next_edge = e1->next;
+        if (e1->next) {
+          e1->next->prev = e0;
+        }
+
+        if (e0->bstate[ABOVE] == BUNDLE_HEAD) {
+          search = 1;
+          while (search) {
+            prev_edge = prev_edge->prev;
+            if (prev_edge) {
+              if (prev_edge->bundle[ABOVE][CLIP] ||
+                  prev_edge->bundle[ABOVE][SUBJ] ||
+                  (prev_edge->bstate[ABOVE] == BUNDLE_HEAD)) {
+                search = 0;
+              }
+            } else {
+              search = 0;
+            }
+          }
+        }
+        if (!prev_edge) {
+          e1->next = aet;
+          aet = e0->next;
+        } else {
+          e1->next = prev_edge->next;
+          prev_edge->next = e0->next;
+        }
+        e0->next->prev = prev_edge;
+        e1->next->prev = e1;
+        e0->next = next_edge;
+      } /* End of IT loop*/
+
+      /* Prepare for next scanbeam */
+      for (edge = aet; edge; edge = next_edge) {
+        next_edge = edge->next;
+        succ_edge = edge->succ;
+
+        if ((edge->top.y == yt) && succ_edge) {
+          /* Replace AET edge by its successor */
+          succ_edge->outp[BELOW] = edge->outp[ABOVE];
+          succ_edge->bstate[BELOW] = edge->bstate[ABOVE];
+          succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
+          succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
+          prev_edge = edge->prev;
+          if (prev_edge) {
+            prev_edge->next = succ_edge;
+          } else {
+            aet = succ_edge;
+          }
+          if (next_edge) {
+            next_edge->prev = succ_edge;
+          }
+          succ_edge->prev = prev_edge;
+          succ_edge->next = next_edge;
+        } else {
+          /* Update this edge */
+          edge->outp[BELOW] = edge->outp[ABOVE];
+          edge->bstate[BELOW] = edge->bstate[ABOVE];
+          edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
+          edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
+          edge->xb = edge->xt;
+        }
+        edge->outp[ABOVE] = NULL;
+      }
+    }
+  } /* === END OF SCANBEAM PROCESSING ================================== */
+
+  // Generate result tristrip from tlist
+  result->strip = NULL;
+  result->num_strips = count_tristrips(tlist);
+  if (result->num_strips > 0) {
+    gpc_malloc<gpc_vertex_list>(result->strip,
+                                result->num_strips * sizeof(gpc_vertex_list),
+                                const_cast<char *>("tristrip list creation"));
+
+    s = 0;
+    for (tn = tlist; tn; tn = tnn) {
+      tnn = tn->next;
+      if (tn->active > 2) {
+        /* Valid tristrip: copy the vertices and free the heap */
+        result->strip[s].num_vertices = tn->active;
+        gpc_malloc<gpc_vertex>(result->strip[s].vertex,
+                               tn->active * sizeof(gpc_vertex),
+                               const_cast<char *>("tristrip creation"));
+        v = 0;
+        if (0) {
+          lt = tn->v[RIGHT];
+          rt = tn->v[LEFT];
+        } else {
+          lt = tn->v[LEFT];
+          rt = tn->v[RIGHT];
+        }
+        while (lt || rt) {
+          if (lt) {
+            ltn = lt->next;
+            result->strip[s].vertex[v].x = lt->x;
+            result->strip[s].vertex[v].y = lt->y;
+            v++;
+            gpc_free<vertex_node>(lt);
+            lt = ltn;
+          }
+          if (rt) {
+            rtn = rt->next;
+            result->strip[s].vertex[v].x = rt->x;
+            result->strip[s].vertex[v].y = rt->y;
+            v++;
+            gpc_free<vertex_node>(rt);
+            rt = rtn;
+          }
+        }
+        s++;
+      } else {
+        /* Invalid tristrip: just free the heap */
+        for (lt = tn->v[LEFT]; lt; lt = ltn) {
+          ltn = lt->next;
+          gpc_free<vertex_node>(lt);
+        }
+        for (rt = tn->v[RIGHT]; rt; rt = rtn) {
+          rtn = rt->next;
+          gpc_free<vertex_node>(rt);
+        }
+      }
+      gpc_free<polygon_node>(tn);
+    }
+  }
+  // Tidy up
+  reset_it(&it);
+  reset_lmt(&lmt);
+  gpc_free<edge_node>(c_heap);
+  gpc_free<edge_node>(s_heap);
+  gpc_free<double>(sbt);
+}  // NOLINT
+
+}  // namespace gpc
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/paddle/fluid/operators/detection/gpc.h b/paddle/fluid/operators/detection/gpc.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee86262ef2c486e4eaeeeaf56c2392d2a1c5851b
--- /dev/null
+++ b/paddle/fluid/operators/detection/gpc.h
@@ -0,0 +1,246 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/***************************************************************************
+ *
+ * Copyright (c) 2015 Baidu.com, Inc. All Rights Reserved
+ *
+ **************************************************************************/
+
+/**
+ * @file include/gpc.h
+ * @author huhan02(com@baidu.com)
+ * @date 2015/12/18 13:52:10
+ * @brief
+ *
+ * @modified by sunyipeng
+ * @email sunyipeng@baidu.com
+ * @date 2018/6/12
+ **/
+
+#ifndef PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_  // GPC_H_
+#define PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_  // GPC_H_
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+namespace gpc {
+
+typedef enum {  // Set operation type
+  GPC_DIFF,     // Difference
+  GPC_INT,      // Intersection
+  GPC_XOR,      // Exclusive or
+  GPC_UNION     // Union
+} gpc_op;
+
+typedef struct {  // Polygon vertex structure
+  double x;       // Vertex x component
+  double y;       // vertex y component
+} gpc_vertex;
+
+typedef struct {       // Vertex list structure
+  int num_vertices;    // Number of vertices in list
+  gpc_vertex *vertex;  // Vertex array pointer
+} gpc_vertex_list;
+
+typedef struct {             // Polygon set structure
+  int num_contours;          // Number of contours in polygon
+  int *hole;                 // Hole  external contour flags
+  gpc_vertex_list *contour;  // Contour array pointer
+} gpc_polygon;
+
+typedef struct {           // Tristrip set structure
+  int num_strips;          // Number of tristrips
+  gpc_vertex_list *strip;  // Tristrip array pointer
+} gpc_tristrip;
+
+typedef enum { LEFT, RIGHT } gpc_left_right;
+
+typedef enum { ABOVE, BELOW } gpc_above_below;
+
+typedef enum { CLIP, SUBJ } gpc_clip_subj;
+
+typedef enum {      /* Edge intersection classes         */
+               NUL, /* Empty non-intersection            */
+               EMX, /* External maximum                  */
+               ELI, /* External left intermediate        */
+               TED, /* Top edge                          */
+               ERI, /* External right intermediate       */
+               RED, /* Right edge                        */
+               IMM, /* Internal maximum and minimum      */
+               IMN, /* Internal minimum                  */
+               EMN, /* External minimum                  */
+               EMM, /* External maximum and minimum      */
+               LED, /* Left edge                         */
+               ILI, /* Internal left intermediate        */
+               BED, /* Bottom edge                       */
+               IRI, /* Internal right intermediate       */
+               IMX, /* Internal maximum                  */
+               FUL  /* Full non-intersection             */
+} vertex_type;
+
+typedef enum {     /* Horizontal edge states            */
+               NH, /* No horizontal edge                */
+               BH, /* Bottom horizontal edge            */
+               TH  /* Top horizontal edge               */
+} h_state;
+
+typedef enum {              /* Edge bundle state                 */
+               UNBUNDLED,   /* Isolated edge not within a bundle */
+               BUNDLE_HEAD, /* Bundle head node                  */
+               BUNDLE_TAIL  /* Passive bundle tail node          */
+} bundle_state;
+
+typedef struct v_shape { /* Internal vertex list datatype     */
+  double x;              /* X coordinate component            */
+  double y;              /* Y coordinate component            */
+  struct v_shape *next;  /* Pointer to next vertex in list    */
+} vertex_node;
+
+typedef struct p_shape { /* Internal contour / tristrip type  */
+  int active;            /* Active flag / vertex count        */
+  int hole;              /* Hole / external contour flag      */
+  vertex_node *v[2];     /* Left and right vertex list ptrs   */
+  struct p_shape *next;  /* Pointer to next polygon contour   */
+  struct p_shape *proxy; /* Pointer to actual structure used  */
+} polygon_node;
+
+typedef struct edge_shape {
+  gpc_vertex vertex;             /* Piggy-backed contour vertex data  */
+  gpc_vertex bot;                /* Edge lower (x, y) coordinate      */
+  gpc_vertex top;                /* Edge upper (x, y) coordinate      */
+  double xb;                     /* Scanbeam bottom x coordinate      */
+  double xt;                     /* Scanbeam top x coordinate         */
+  double dx;                     /* Change in x for a unit y increase */
+  int type;                      /* Clip / subject edge flag          */
+  int bundle[2][2];              /* Bundle edge flags                 */
+  int bside[2];                  /* Bundle left / right indicators    */
+  bundle_state bstate[2];        /* Edge bundle state                 */
+  polygon_node *outp[2];         /* Output polygon / tristrip pointer */
+  struct edge_shape *prev;       /* Previous edge in the AET          */
+  struct edge_shape *next;       /* Next edge in the AET              */
+  struct edge_shape *pred;       /* Edge connected at the lower end   */
+  struct edge_shape *succ;       /* Edge connected at the upper end   */
+  struct edge_shape *next_bound; /* Pointer to next bound in LMT      */
+} edge_node;
+
+inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); }
+
+inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); }
+
+inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); }
+
+inline int gpc_next_index(int i, int n) { return ((i + 1) % n); }
+
+inline int gpc_optimal(gpc_vertex *v, int i, int n) {
+  return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y);
+}
+
+inline int gpc_fwd_min(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y > v[i].vertex.y &&
+          v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y);
+}
+
+inline int gpc_not_fmax(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y > v[i].vertex.y);
+}
+
+inline int gpc_rev_min(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y >= v[i].vertex.y &&
+          v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
+}
+
+inline int gpc_not_rmax(edge_node *v, int i, int n) {
+  return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
+}
+
+// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j)
+// {
+inline void gpc_p_edge(edge_node *d, edge_node *e, int p) {
+  d = e;
+  do {
+    d = d->prev;
+  } while (!d->outp[p]);
+  // i = d->bot.x + d->dx * (j - d->bot.y);
+}
+
+// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j)
+// {
+inline void gpc_n_edge(edge_node *d, edge_node *e, int p) {
+  d = e;
+  do {
+    d = d->next;
+  } while (!d->outp[p]);
+  // i = d->bot.x + d->dx * (j - d->bot.y);
+}
+
+template <typename T>
+void gpc_malloc(T *&p, int b, char *s) {
+  if (b > 0) {
+    p = (T *)malloc(b);
+
+    if (!p) {
+      fprintf(stderr, "gpc malloc failure: %s\n", s);
+      exit(0);
+    }
+  } else {
+    p = NULL;
+  }
+}
+template <typename T>
+void gpc_free(T *&p) {
+  if (p) {
+    free(p);
+    p = NULL;
+  }
+}
+
+/*
+===========================================================================
+                       Public Function Prototypes
+===========================================================================
+*/
+
+void add_vertex(vertex_node **t, double x, double y);
+
+void gpc_vertex_create(edge_node *e, int p, int s, double x, double y);
+
+/*
+void gpc_read_polygon(FILE *infile_ptr, int read_hole_flags,
+                      gpc_polygon *polygon);
+
+void gpc_write_polygon(FILE *outfile_ptr, int write_hole_flags,
+                       gpc_polygon *polygon);
+*/
+void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole);
+
+void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
+                      gpc_polygon *clip_polygon, gpc_polygon *result_polygon);
+
+void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
+                       gpc_polygon *clip_polygon,
+                       gpc_tristrip *result_tristrip);
+
+void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip);
+
+void gpc_free_polygon(gpc_polygon *polygon);
+
+void gpc_free_tristrip(gpc_tristrip *tristrip);
+
+}  // namespace gpc
+
+#endif  // PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index 60b93efdce810f8552374449fe5a6fc79b1a92c1..9e78b28a6011bb7bd299ca3438eb407f600d7000 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -9,10 +9,11 @@ http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/poly_util.h"
 
 namespace paddle {
 namespace operators {
@@ -20,9 +21,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-constexpr int64_t kOutputDim = 6;
-constexpr int64_t kBBoxSize = 4;
-
 class MultiClassNMSOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -42,10 +40,15 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
                       "The rank of Input(BBoxes) must be 3.");
     PADDLE_ENFORCE_EQ(score_dims.size(), 3,
                       "The rank of Input(Scores) must be 3.");
-    PADDLE_ENFORCE_EQ(box_dims[2], 4,
-                      "The 2nd dimension of Input(BBoxes) must be 4, "
-                      "represents the layout of coordinate "
-                      "[xmin, ymin, xmax, ymax]");
+    PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 ||
+                       box_dims[2] == 24 || box_dims[2] == 32,
+                   "The 2nd dimension of Input(BBoxes) must be 4 or 8, "
+                   "represents the layout of coordinate "
+                   "[xmin, ymin, xmax, ymax] or "
+                   "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
+                   "8 points: [xi, yi] i= 1,2,...,8 or "
+                   "12 points: [xi, yi] i= 1,2,...,12 or "
+                   "16 points: [xi, yi] i= 1,2,...,16");
     PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
                       "The 1st dimensiong of Input(BBoxes) must be equal to "
                       "3rd dimension of Input(Scores), which represents the "
@@ -53,7 +56,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
 
     // Here the box_dims[0] is not the real dimension of output.
     // It will be rewritten in the computing kernel.
-    ctx->SetOutputDim("Out", {box_dims[1], 6});
+    ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2});
   }
 
  protected:
@@ -128,6 +131,21 @@ static inline T JaccardOverlap(const T* box1, const T* box2,
   }
 }
 
+template <class T>
+T PolyIoU(const T* box1, const T* box2, const size_t box_size,
+          const bool normalized) {
+  T bbox1_area = PolyArea<T>(box1, box_size, normalized);
+  T bbox2_area = PolyArea<T>(box2, box_size, normalized);
+  T inter_area = PolyOverlapArea<T>(box1, box2, box_size, normalized);
+  if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) {
+    // If coordinate values are is invalid
+    // if area size <= 0,  return 0.
+    return T(0.);
+  } else {
+    return inter_area / (bbox1_area + bbox2_area - inter_area);
+  }
+}
+
 template <typename T>
 class MultiClassNMSKernel : public framework::OpKernel<T> {
  public:
@@ -137,6 +155,8 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     // The total boxes for each instance.
     int64_t num_boxes = bbox.dims()[0];
     // 4: [xmin ymin xmax ymax]
+    // 8: [x1 y1 x2 y2 x3 y3 x4 y4]
+    // 16, 24, or 32: [x1 y1 x2 y2 ...  xn yn], n = 8, 12 or 16
     int64_t box_size = bbox.dims()[1];
 
     std::vector<T> scores_data(num_boxes);
@@ -154,8 +174,19 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       for (size_t k = 0; k < selected_indices->size(); ++k) {
         if (keep) {
           const int kept_idx = (*selected_indices)[k];
-          T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
+          T overlap = T(0.);
+          // 4: [xmin ymin xmax ymax]
+          if (box_size == 4) {
+            overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                         bbox_data + kept_idx * box_size, true);
+          }
+          // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32
+          if (box_size == 8 || box_size == 16 || box_size == 24 ||
+              box_size == 32) {
+            overlap =
+                PolyIoU<T>(bbox_data + idx * box_size,
+                           bbox_data + kept_idx * box_size, box_size, true);
+          }
           keep = overlap <= adaptive_threshold;
         } else {
           break;
@@ -228,7 +259,9 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
   void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
                         const std::map<int, std::vector<int>>& selected_indices,
                         Tensor* outs) const {
-    int predict_dim = scores.dims()[1];
+    int64_t predict_dim = scores.dims()[1];
+    int64_t box_size = bboxes.dims()[1];
+    int64_t out_dim = bboxes.dims()[1] + 2;
     auto* scores_data = scores.data<T>();
     auto* bboxes_data = bboxes.data<T>();
     auto* odata = outs->data<T>();
@@ -240,11 +273,11 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       const std::vector<int>& indices = it.second;
       for (size_t j = 0; j < indices.size(); ++j) {
         int idx = indices[j];
-        const T* bdata = bboxes_data + idx * kBBoxSize;
-        odata[count * kOutputDim] = label;           // label
-        odata[count * kOutputDim + 1] = sdata[idx];  // score
-        // xmin, ymin, xmax, ymax
-        std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+        const T* bdata = bboxes_data + idx * box_size;
+        odata[count * out_dim] = label;           // label
+        odata[count * out_dim + 1] = sdata[idx];  // score
+        // xmin, ymin, xmax, ymax or multi-points coordinates
+        std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T));
         count++;
       }
     }
@@ -261,6 +294,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     int64_t class_num = score_dims[1];
     int64_t predict_dim = score_dims[2];
     int64_t box_dim = boxes->dims()[2];
+    int64_t out_dim = boxes->dims()[2] + 2;
 
     std::vector<std::map<int, std::vector<int>>> all_indices;
     std::vector<size_t> batch_starts = {0};
@@ -283,7 +317,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
       od[0] = -1;
     } else {
-      outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
+      outs->mutable_data<T>({num_kept, out_dim}, ctx.GetPlace());
       for (int64_t i = 0; i < batch_size; ++i) {
         Tensor ins_score = scores->Slice(i, i + 1);
         ins_score.Resize({class_num, predict_dim});
@@ -311,10 +345,11 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("BBoxes",
-             "(Tensor) A 3-D Tensor with shape [N, M, 4] represents the "
+             "(Tensor) A 3-D Tensor with shape "
+             "[N, M, 4 or 8 16 24 32] represents the "
              "predicted locations of M bounding bboxes, N is the batch size. "
              "Each bounding box has four coordinate values and the layout is "
-             "[xmin, ymin, xmax, ymax].");
+             "[xmin, ymin, xmax, ymax], when box size equals to 4.");
     AddInput("Scores",
              "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
              "predicted confidence predictions. N is the batch size, C is the "
@@ -351,8 +386,12 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out",
               "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
               "detections. Each row has 6 values: "
-              "[label, confidence, xmin, ymin, xmax, ymax], No is the total "
-              "number of detections in this mini-batch. For each instance, "
+              "[label, confidence, xmin, ymin, xmax, ymax] or "
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 10] represents the "
+              "detections. Each row has 10 values: "
+              "[label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the "
+              "total number of detections in this mini-batch."
+              "For each instance, "
               "the offsets in first dimension are called LoD, the number of "
               "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
               "no detected bbox.");
diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/fluid/operators/detection/poly_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1af2c95c6cf526d651b196b54614a21a9cddde8c
--- /dev/null
+++ b/paddle/fluid/operators/detection/poly_util.cc
@@ -0,0 +1,132 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef POLY_UTIL_CC_
+#define POLY_UTIL_CC_
+
+#include "paddle/fluid/operators/detection/poly_util.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using gpc::gpc_polygon_clip;
+using gpc::gpc_free_polygon;
+
+template <class T>
+void Array2PointVec(const T*& box, const size_t box_size,
+                    std::vector<Point_<T>>& vec) {
+  size_t pts_num = box_size / 2;
+  vec.resize(pts_num);
+  for (size_t i = 0; i < pts_num; i++) {
+    vec.at(i).x = box[2 * i];
+    vec.at(i).y = box[2 * i + 1];
+  }
+}
+
+template <class T>
+void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly) {
+  size_t pts_num = box_size / 2;
+  poly.num_contours = 1;
+  poly.hole = (int*)malloc(sizeof(int));
+  poly.hole[0] = 0;
+  poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
+  poly.contour->num_vertices = pts_num;
+  poly.contour->vertex =
+      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
+  for (size_t i = 0; i < pts_num; ++i) {
+    poly.contour->vertex[i].x = box[2 * i];
+    poly.contour->vertex[i].y = box[2 * i + 1];
+  }
+}
+
+template <class T>
+void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly) {
+  int pts_num = vec.size();
+  poly.num_contours = 1;
+  poly.hole = (int*)malloc(sizeof(int));
+  poly.hole[0] = 0;
+  poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
+  poly.contour->num_vertices = pts_num;
+  poly.contour->vertex =
+      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
+  for (size_t i = 0; i < pts_num; ++i) {
+    poly.contour->vertex[i].x = vec[i].x;
+    poly.contour->vertex[i].y = vec[i].y;
+  }
+}
+
+template <class T>
+void Poly2PointVec(const gpc::gpc_vertex_list& contour,
+                   std::vector<Point_<T>>& vec) {
+  int pts_num = contour.num_vertices;
+  vec.resize(pts_num);
+  for (int i = 0; i < pts_num; i++) {
+    vec.at(i).x = contour.vertex[i].x;
+    vec.at(i).y = contour.vertex[i].y;
+  }
+}
+
+template <class T>
+T GetContourArea(std::vector<Point_<T>>& vec) {
+  size_t pts_num = vec.size();
+  if (pts_num < 3) return T(0.);
+  T area = T(0.);
+  for (size_t i = 0; i < pts_num; ++i) {
+    area += vec[i].x * vec[(i + 1) % pts_num].y -
+            vec[i].y * vec[(i + 1) % pts_num].x;
+  }
+  return std::fabs(area / 2.0);
+}
+
+template <class T>
+T PolyArea(const T* box, const size_t box_size, const bool normalized) {
+  // If coordinate values are is invalid
+  // if area size <= 0,  return 0.
+  std::vector<Point_<T>> vec;
+  Array2PointVec<T>(box, box_size, vec);
+  return GetContourArea<T>(vec);
+}
+
+template <class T>
+T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
+                  const bool normalized) {
+  gpc::gpc_polygon poly1;
+  gpc::gpc_polygon poly2;
+  Array2Poly<T>(box1, box_size, poly1);
+  Array2Poly<T>(box2, box_size, poly2);
+  gpc::gpc_polygon respoly;
+  gpc::gpc_op op = gpc::GPC_INT;
+  gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
+
+  T inter_area = T(0.);
+  int contour_num = respoly.num_contours;
+  for (int i = 0; i < contour_num; ++i) {
+    std::vector<Point_<T>> resvec;
+    Poly2PointVec<T>(respoly.contour[i], resvec);
+    // inter_area += std::fabs(cv::contourArea(resvec)) + 0.5f *
+    // (cv::arcLength(resvec, true));
+    inter_area += GetContourArea<T>(resvec);
+  }
+
+  gpc::gpc_free_polygon(&poly1);
+  gpc::gpc_free_polygon(&poly2);
+  gpc::gpc_free_polygon(&respoly);
+  return inter_area;
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/fluid/operators/detection/poly_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..f07baf72d9ff07b8fcb45dcfb2a35741fb1aeed0
--- /dev/null
+++ b/paddle/fluid/operators/detection/poly_util.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef POLY_UTIL_H_
+#define POLY_UTIL_H_
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/gpc.h"
+
+namespace paddle {
+namespace operators {
+
+template <class T>
+class Point_ {
+ public:
+  // default constructor
+  Point_() {}
+  Point_(T _x, T _y) {}
+  Point_(const Point_& pt) {}
+
+  Point_& operator=(const Point_& pt);
+  // conversion to another data type
+  // template<typename _T> operator Point_<_T>() const;
+  // conversion to the old-style C structures
+  // operator Vec<T, 2>() const;
+
+  // checks whether the point is inside the specified rectangle
+  // bool inside(const Rect_<T>& r) const;
+  T x;  //!< x coordinate of the point
+  T y;  //!< y coordinate of the point
+};
+
+template <class T>
+void Array2PointVec(const T*& box, const size_t box_size,
+                    std::vector<Point_<T>>& vec);
+
+template <class T>
+void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly);
+
+template <class T>
+void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly);
+
+template <class T>
+void Poly2PointVec(const gpc::gpc_vertex_list& contour,
+                   std::vector<Point_<T>>& vec);
+
+template <class T>
+T GetContourArea(std::vector<Point_<T>>& vec);
+
+template <class T>
+T PolyArea(const T* box, const size_t box_size, const bool normalized);
+
+template <class T>
+T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
+                  const bool normalized);
+}  // namespace operators
+}  // namespace paddle
+
+#include "paddle/fluid/operators/detection/poly_util.cc"
+
+#endif  // POLY_UTIL_H_
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
index 568d50d457d838d5f11605710c0d3b987af01d10..4b3bc2edb58fe23393d906094c41b6ad62c71155 100644
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -41,9 +41,9 @@ class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
         for (int id_w = 0; id_w < width; ++id_w) {
           id = id_n * height * width + width * id_h + id_w;
           if (id_n % 2 == 0) {
-            out_data[id] = id_w - in_data[id];
+            out_data[id] = id_w * 4 - in_data[id];
           } else {
-            out_data[id] = id_h - in_data[id];
+            out_data[id] = id_h * 4 - in_data[id];
           }
         }
       }
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
index 6187ac6622c65d2bbc525c3fe2cb397cf74ac612..e1eaf084a3413dd1d13514e2d7b22572d21dd119 100644
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
@@ -32,9 +32,9 @@ __global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
   if (id_n < n && id_h < h && id_w < w) {
     int id = id_n * h * w + w * id_h + id_w;
     if (id_n % 2 == 0) {
-      output[id] = id_w - input[id];
+      output[id] = id_w * 4 - input[id];
     } else {
-      output[id] = id_h - input[id];
+      output[id] = id_h * 4 - input[id];
     }
   }
 }
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index dda423efd35b96f5e1d7c55389818f46ef3d8694..46fff9d338b7759496faaf6dd9960d34887755ba 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -52,6 +52,9 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(
         ctx->HasOutput("TargetBBox"),
         "Output(TargetBBox) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("BBoxInsideWeight"),
+        "Output(BBoxInsideWeight) of RpnTargetAssignOp should not be null");
 
     auto anchor_dims = ctx->GetInputDim("Anchor");
     auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
@@ -68,6 +71,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ScoreIndex", {-1});
     ctx->SetOutputDim("TargetLabel", {-1, 1});
     ctx->SetOutputDim("TargetBBox", {-1, 4});
+    ctx->SetOutputDim("BBoxInsideWeight", {-1, 4});
   }
 
  protected:
@@ -169,6 +173,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
                  const float rpn_positive_overlap,
                  const float rpn_negative_overlap, std::vector<int>* fg_inds,
                  std::vector<int>* bg_inds, std::vector<int>* tgt_lbl,
+                 std::vector<int>* fg_fake, std::vector<T>* bbox_inside_weight,
                  std::minstd_rand engine, bool use_random) {
   float epsilon = 0.00001;
   int anchor_num = anchor_to_gt_max.dims()[0];
@@ -201,12 +206,12 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
   // Reservoir Sampling
   int fg_num = static_cast<int>(rpn_fg_fraction * rpn_batch_size_per_im);
   ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random);
-  fg_num = static_cast<int>(fg_inds_fake.size());
-  for (int64_t i = 0; i < fg_num; ++i) {
+  int fg_fake_num = static_cast<int>(fg_inds_fake.size());
+  for (int64_t i = 0; i < fg_fake_num; ++i) {
     target_label[fg_inds_fake[i]] = 1;
   }
 
-  int bg_num = rpn_batch_size_per_im - fg_num;
+  int bg_num = rpn_batch_size_per_im - fg_fake_num;
   for (int64_t i = 0; i < anchor_num; ++i) {
     if (anchor_to_gt_max_data[i] < rpn_negative_overlap) {
       bg_inds_fake.push_back(i);
@@ -214,12 +219,28 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
   }
   ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random);
   bg_num = static_cast<int>(bg_inds_fake.size());
+  int fake_num = 0;
   for (int64_t i = 0; i < bg_num; ++i) {
+    // fg fake found
+    if (target_label[bg_inds_fake[i]] == 1) {
+      fake_num++;
+      fg_fake->emplace_back(fg_inds_fake[0]);
+      for (int j = 0; j < 4; ++j) {
+        bbox_inside_weight->emplace_back(T(0.));
+      }
+    }
     target_label[bg_inds_fake[i]] = 0;
   }
 
+  for (int64_t i = 0; i < (fg_fake_num - fake_num) * 4; ++i) {
+    bbox_inside_weight->emplace_back(T(1.));
+  }
+
   for (int64_t i = 0; i < anchor_num; ++i) {
-    if (target_label[i] == 1) fg_inds->emplace_back(i);
+    if (target_label[i] == 1) {
+      fg_inds->emplace_back(i);
+      fg_fake->emplace_back(i);
+    }
     if (target_label[i] == 0) bg_inds->emplace_back(i);
   }
   fg_num = fg_inds->size();
@@ -248,7 +269,8 @@ std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
   std::vector<int> bg_inds;
   std::vector<int> gt_inds;
   std::vector<int> tgt_lbl;
-
+  std::vector<int> fg_fake;
+  std::vector<T> bbox_inside_weight;
   // Calculate the max IoU between anchors and gt boxes
   // Map from anchor to gt box that has highest overlap
   auto place = ctx.GetPlace();
@@ -275,32 +297,37 @@ std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
   // Follow the Faster RCNN's implementation
   ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max,
               rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap,
-              rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine,
-              use_random);
+              rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, &fg_fake,
+              &bbox_inside_weight, engine, use_random);
 
   int fg_num = fg_inds.size();
   int bg_num = bg_inds.size();
-  gt_inds.reserve(fg_num);
-  for (int i = 0; i < fg_num; ++i) {
-    gt_inds.emplace_back(argmax[fg_inds[i]]);
+  int fg_fake_num = fg_fake.size();
+  gt_inds.reserve(fg_fake_num);
+  for (int i = 0; i < fg_fake_num; ++i) {
+    gt_inds.emplace_back(argmax[fg_fake[i]]);
   }
-
-  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t;
-  int* loc_index_data = loc_index_t.mutable_data<int>({fg_num}, place);
+  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t;
+  int* loc_index_data = loc_index_t.mutable_data<int>({fg_fake_num}, place);
   int* score_index_data =
       score_index_t.mutable_data<int>({fg_num + bg_num}, place);
   int* tgt_lbl_data = tgt_lbl_t.mutable_data<int>({fg_num + bg_num}, place);
-  int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_num}, place);
-  std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data);
+  int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_fake_num}, place);
+  T* bbox_inside_weight_data =
+      bbox_inside_weight_t.mutable_data<T>({fg_fake_num, 4}, place);
+  std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data);
   std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);
   std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num);
   std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data);
   std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data);
+  std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(),
+            bbox_inside_weight_data);
   std::vector<Tensor> loc_score_tgtlbl_gt;
   loc_score_tgtlbl_gt.emplace_back(loc_index_t);
   loc_score_tgtlbl_gt.emplace_back(score_index_t);
   loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
   loc_score_tgtlbl_gt.emplace_back(gt_inds_t);
+  loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t);
 
   return loc_score_tgtlbl_gt;
 }
@@ -318,6 +345,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
     auto* score_index = context.Output<LoDTensor>("ScoreIndex");
     auto* tgt_bbox = context.Output<LoDTensor>("TargetBBox");
     auto* tgt_lbl = context.Output<LoDTensor>("TargetLabel");
+    auto* bbox_inside_weight = context.Output<LoDTensor>("BBoxInsideWeight");
 
     PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
                       "RpnTargetAssignOp gt_boxes needs 1 level of LoD");
@@ -340,7 +368,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
     score_index->mutable_data<int>({max_num}, place);
     tgt_bbox->mutable_data<T>({max_num, 4}, place);
     tgt_lbl->mutable_data<int>({max_num, 1}, place);
-
+    bbox_inside_weight->mutable_data<T>({max_num, 4}, place);
     auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
 
     std::random_device rnd;
@@ -394,6 +422,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
       Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
       Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
       Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
+      Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4];
 
       int loc_num = sampled_loc_index.dims()[0];
       int score_num = sampled_score_index.dims()[0];
@@ -432,6 +461,8 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
       AppendRpns<int>(score_index, total_score_num, &sampled_score_index_unmap);
       AppendRpns<T>(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox);
       AppendRpns<int>(tgt_lbl, total_score_num, &sampled_tgtlbl);
+      AppendRpns<T>(bbox_inside_weight, total_loc_num * 4,
+                    &sampled_bbox_inside_weight);
       total_loc_num += loc_num;
 
       total_score_num += score_num;
@@ -448,10 +479,12 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
     score_index->set_lod(loc_score);
     tgt_bbox->set_lod(lod_loc);
     tgt_lbl->set_lod(loc_score);
+    bbox_inside_weight->set_lod(lod_loc);
     loc_index->Resize({total_loc_num});
     score_index->Resize({total_score_num});
     tgt_bbox->Resize({total_loc_num, 4});
     tgt_lbl->Resize({total_score_num, 1});
+    bbox_inside_weight->Resize({total_loc_num, 4});
   }
 };
 
@@ -514,6 +547,9 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
         "TargetLabel",
         "(Tensor<int>), The target labels of each anchor with shape "
         "[F + B, 1], F and B are sampled foreground and backgroud number.");
+    AddOutput("BBoxInsideWeight",
+              "(Tensor), The bbox inside weight with shape "
+              "[F, 4], F is the sampled foreground number.");
     AddComment(R"DOC(
 This operator can be, for a given set of ground truth bboxes and the
 anchors, to assign classification and regression targets to each prediction.
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 56734b81e8716a0c0c37a11e35c9118ee7b55020..21db93958a4a586c74a1e060f1f04b5af1dcd889 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -20,7 +20,7 @@ if(WITH_GRPC)
     DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
   cc_test(rpc_server_test SRCS rpc_server_test.cc
     DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
-  cc_test(varhandle_test SRCS varhandle_test.cc)
+  cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
   return()
 endif()
 
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 13682b78f0eccf049daa315f3a26aafd22e42a41..f5d5627815c7320dad5051b0f7d95b8ec6703687 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/distributed/grpc_client.h"
-
 #include <sys/time.h>
-
 #include <limits>
 
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/grpc_client.h"
 #include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -73,10 +71,11 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   SendProcessor* s = new SendProcessor(ch);
-  VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope));
+  const std::string method = "SendRPC";
+  VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
+  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] {
     auto* var = p_scope->FindVar(var_name_val);
 
     ::grpc::ByteBuffer req;
@@ -87,10 +86,16 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
     // stub context
     s->response_call_back_ = nullptr;
 
+    platform::RecordRPCEvent record_event(method, p_ctx);
+
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+
+    if (UNLIKELY(platform::IsProfileEnabled())) {
+      h->Wait();
+    }
   });
   req_count_++;
 
@@ -122,10 +127,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   GetProcessor* s = new GetProcessor(ch);
-  VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
+  const std::string method = "GetRPC";
+  VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, s, this] {
+  framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -137,10 +143,16 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
     // stub context
     s->response_call_back_ = ProcGetResponse;
 
+    platform::RecordRPCEvent record_event(method, p_ctx);
+
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+
+    if (UNLIKELY(platform::IsProfileEnabled())) {
+      h->Wait();
+    }
   });
 
   req_count_++;
@@ -161,12 +173,14 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   GetProcessor* s = new GetProcessor(ch);
-  VarHandlePtr h(
-      new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope));
+
+  const std::string method = "PrefetchRPC";
+
+  VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
   framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      s, this] {
+                      s, method, h, this] {
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
@@ -177,11 +191,17 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
     // stub context
     s->response_call_back_ = ProcGetResponse;
 
+    platform::RecordRPCEvent record_event(method, p_ctx);
+
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
         &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
+
+    if (UNLIKELY(platform::IsProfileEnabled())) {
+      h->Wait();
+    }
   });
 
   req_count_++;
@@ -193,15 +213,24 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE,
-                               nullptr, nullptr));
+  const std::string method = "BatchBarrierRPC";
+  VarHandlePtr h(
+      new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr));
   s->Prepare(h, time_out);
 
   sendrecv::VariableMessage req;
   req.set_varname(BATCH_BARRIER_MESSAGE);
+
+  platform::RecordRPCEvent record_event(method, nullptr);
+
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
   return h;
 }
 
@@ -209,15 +238,24 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
                                                int64_t time_out) {
   const auto ch = GetChannel(ep);
   FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
-  VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE,
-                               nullptr, nullptr));
+  const std::string method = "FetchBarrierRPC";
+  VarHandlePtr h(
+      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
   s->Prepare(h, time_out);
 
   sendrecv::VariableMessage req;
   req.set_varname(FETCH_BARRIER_MESSAGE);
+
+  platform::RecordRPCEvent record_event(method, nullptr);
+
   auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
   return h;
 }
 
@@ -226,15 +264,23 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  VarHandlePtr h(
-      new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr));
+  const std::string method = "SendCompleteRPC";
+  VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr));
   s->Prepare(h, time_out);
 
   sendrecv::VariableMessage req;
   req.set_varname(COMPLETE_MESSAGE);
+
+  platform::RecordRPCEvent record_event(method, nullptr);
+
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
   return h;
 }
 
@@ -244,17 +290,27 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
   const auto ch = GetChannel(ep);
 
   CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
-  VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE,
-                               nullptr, nullptr));
+
+  const std::string method = "CheckPointNotifyRPC";
+
+  VarHandlePtr h(
+      new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr));
   s->Prepare(h, time_out);
 
   sendrecv::VariableMessage req;
   req.set_varname(CHECKPOINT_SAVE_MESSAGE);
   req.set_out_varname(dir);
 
+  platform::RecordRPCEvent record_event(method, nullptr);
+
   auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
   return h;
 }
 
@@ -273,12 +329,16 @@ void GRPCClient::Proceed() {
     BaseProcessor* c = static_cast<BaseProcessor*>(tag);
     GPR_ASSERT(ok);
     PADDLE_ENFORCE(c);
+
     if (c->status_.ok()) {
       VLOG(3) << c->GetVarHandlePtr()->String() << " process";
       c->Process();
     } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
+      // FIXME(gongwb): parse error_details?
       LOG(ERROR) << c->GetVarHandlePtr()->String()
-                 << " meets grpc error:" << c->status_.error_message();
+                 << " meets grpc error, error_code:" << c->status_.error_code()
+                 << " error_message:" << c->status_.error_message()
+                 << " error_details:" << c->status_.error_details();
       {
         std::lock_guard<std::mutex> lk(sync_mutex_);
         ok_ = false;
@@ -286,7 +346,10 @@ void GRPCClient::Proceed() {
       c->Finish(false);
     } else {
       LOG(FATAL) << c->GetVarHandlePtr()->String()
-                 << " meets grpc error:" << c->status_.error_message();
+                 << " meets grpc error, error_code:" << c->status_.error_code()
+                 << " error_message:" << c->status_.error_message()
+                 << " error_details:" << c->status_.error_details();
+
       c->Finish(false);
     }
 
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index 3f8796713a6b89a308113981614673e07e8d367f..bac098b892658beece85271765eb31eeb3eeda17 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -36,6 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg,
                            const std::string& out_name) {
+  platform::RecordRPCEvent record_event("serial", &ctx);
   // Default DestroyCallback does nothing, When using GPU
   // the CPU buffer need to be freed.
   DestroyCallback destroy_callback = [](void* backing) {};
@@ -147,6 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
                                const framework::Scope* scope,
                                framework::Variable** var) {
+  platform::RecordRPCEvent record_event("deserial", &ctx);
   operators::distributed::GRPCVariableResponse resp(scope, &ctx);
   PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
   *var = resp.GetVar();
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 07322e720f26213ea777be3cd22f2fead28507f0..3c28ef30922e6d6ba09b96282619eef15867631e 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/dropout_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -57,6 +58,29 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
                   "will be dropped.")
         .SetDefault(false);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
+    AddAttr<std::string>(
+        "dropout_implementation",
+        "[\"downgrade_in_infer\"|\"upscale_in_train\"]"
+        "There are two kinds of ways to implement dropout"
+        "(the mask below is a tensor have the same shape with input"
+        "the value of mask is 0 or 1, the ratio of 0 is dropout_prob)"
+        "1. downgrade_in_infer(default), downgrade the outcome at inference "
+        "time"
+        "   train: out = input * mask"
+        "   inference: out = input * dropout_prob"
+        "2. upscale_in_train, upscale the outcome at training time, do nothing "
+        "in inference"
+        "   train: out = input * mask / ( 1.0 - dropout_prob )"
+        "   inference: out = input"
+        "   dropout op can be removed from the program. the program will be "
+        "efficient")
+        .SetDefault("downgrade_in_infer")
+        .AddCustomChecker([](const std::string& type) {
+          PADDLE_ENFORCE(
+              type == "downgrade_in_infer" || type == "upscale_in_train",
+              "dropout_implementation can only be downgrade_in_infer or "
+              "upscale_in_train");
+        });
 
     AddComment(R"DOC(
 Dropout Operator.
@@ -104,7 +128,9 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 1dd66e0280c46c0624ff70e822cb6fa6f06b7aa9..e011f47e086183a4ef3a3373c17acd6c21b6cf7e 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include <string>
 #include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -26,7 +27,8 @@ namespace operators {
 template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
                                 const float dropout_prob, const T* src,
-                                T* mask_data, T* dst) {
+                                T* mask_data, T* dst,
+                                bool is_upscale_in_train) {
   thrust::minstd_rand rng;
   rng.seed(seed);
   thrust::uniform_real_distribution<float> dist(0, 1);
@@ -47,7 +49,11 @@ __global__ void RandomGenerator(const size_t n, const int seed,
     if (dist(rng) < dropout_prob) {
       mask = static_cast<T>(0);
     } else {
-      mask = static_cast<T>(1);
+      if (is_upscale_in_train) {
+        mask = static_cast<T>(1.0f / (1.0f - dropout_prob));
+      } else {
+        mask = static_cast<T>(1);
+      }
     }
     dest = s * mask;
     mask_data[idx] = mask;
@@ -67,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
+    auto dropout_implementation =
+        context.Attr<std::string>("dropout_implementation");
     auto& place = *context.template device_context<Place>().eigen_device();
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
@@ -83,11 +91,16 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
       int grid = (x->numel() + threads - 1) / threads;
       RandomGenerator<
           T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-          size, seed, dropout_prob, x_data, mask_data, y_data);
+          size, seed, dropout_prob, x_data, mask_data, y_data,
+          (dropout_implementation == "upscale_in_train"));
     } else {
       auto X = EigenMatrix<T>::Reshape(*x, 1);
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
-      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+      if (dropout_implementation == "upscale_in_train") {
+        Y.device(place) = X;
+      } else {
+        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+      }
     }
   }
 };
@@ -99,6 +112,8 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
-    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(dropout_grad,
-                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    dropout_grad, ops::DropoutGradKernel<plat::CUDADeviceContext, float>,
+    ops::DropoutGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 0628b4b826d2730a8e3fb4842e4ae550b8c00569..6c629b7b6d255828023ed25680675ca104a33e12 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <random>
+#include <string>
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -36,6 +37,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
+    auto dropout_implementation =
+        context.Attr<std::string>("dropout_implementation");
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
@@ -49,14 +52,20 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       engine.seed(seed);
 
       std::uniform_real_distribution<float> dist(0, 1);
+
       size_t size = framework::product(mask->dims());
       for (size_t i = 0; i < size; ++i) {
         if (dist(engine) < dropout_prob) {
           mask_data[i] = 0;
           y_data[i] = 0;
         } else {
-          mask_data[i] = 1;
-          y_data[i] = x_data[i];
+          if (dropout_implementation == "upscale_in_train") {
+            mask_data[i] = 1.0f / static_cast<T>(1.0f - dropout_prob);
+            y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
+          } else {
+            mask_data[i] = 1;
+            y_data[i] = x_data[i];
+          }
         }
       }
     } else {
@@ -64,7 +73,11 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
       auto& place =
           *context.template device_context<DeviceContext>().eigen_device();
-      Y.device(place) = X * (1.0f - dropout_prob);
+      if (dropout_implementation == "upscale_in_train") {
+        Y.device(place) = X;
+      } else {
+        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index 7e5975ead64ab39a9c618a33e300c4fce55a5b22..68c6e315cc3b5fa932f8946f6d4f838f4d3fc5a5 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -80,8 +80,6 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
-    // memory.").AsIntermediate();
     AddOutput("Out", "The output of elementwise op.");
     AddAttr<int>("axis",
                  "(int, default -1). The start dimension index "
@@ -129,13 +127,11 @@ But the output only shares the LoD information with the input $X$.
 
 )DOC",
                                GetName(), GetEquation()));
-    SetReuse();
   }
 
  protected:
   virtual std::string GetName() const = 0;
   virtual std::string GetEquation() const = 0;
-  virtual void SetReuse() {}
 };
 
 class ElementwiseOpGrad : public framework::OperatorWithKernel {
@@ -269,7 +265,6 @@ class ElemwiseGradKernel : public framework::OpKernel<T> {
    protected:                                                          \
     virtual std::string GetName() const { return op_name; }            \
     virtual std::string GetEquation() const { return equation; }       \
-    virtual void SetReuse() { Reuse(__VA_ARGS__); }                    \
   };                                                                   \
   REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,       \
                     __ElemwiseOp##op_type##Maker__,                    \
diff --git a/paddle/fluid/operators/fake_init_op.cc b/paddle/fluid/operators/fake_init_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..28ebdcb03ea83f3ec701106111a7cc5f0f7ed7dc
--- /dev/null
+++ b/paddle/fluid/operators/fake_init_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class FakeInitInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FakeInitOp should not be null.");
+    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
+    ctx->SetOutputDim("Out", framework::make_ddim(shape));
+  }
+};
+
+class FakeInitOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    framework::Tensor *tensor = nullptr;
+
+    auto &out_var = *scope.FindVar(Output("Out"));
+
+    if (out_var.IsType<framework::LoDTensor>()) {
+      tensor = out_var.GetMutable<framework::LoDTensor>();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
+    } else if (out_var.IsType<framework::SelectedRows>()) {
+      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
+    } else {
+      PADDLE_THROW(
+          "fake init op's output only"
+          "supports SelectedRows and LoDTensor");
+    }
+  }
+};
+
+class FakeInitOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
+class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddAttr<std::vector<int64_t>>("shape",
+                                  "(vector<int64_t>) The shape of the output");
+    AddOutput("Out",
+              "(Tensor) Tensor of specified shape will be filled "
+              "with the specified value");
+    AddComment(R"DOC(
+FakeInit Operator.
+
+Init an variable but not alloc memory for it, it is used for init the
+table parameter at trainer side in distributed lookup table.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fake_init, ops::FakeInitOp, ops::FakeInitInferShape,
+                  ops::FakeInitOpMaker, paddle::framework::EmptyGradOpMaker,
+                  ops::FakeInitOpVarTypeInference);
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 2826b82117db113d4d8c10095e89f610ca895775..252f313440296bd9e5eebf26f67b08bbe7decce8 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -24,7 +24,7 @@ class FillConstantInferShape : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillConstantOp should not be null.");
-    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
     ctx->SetOutputDim("Out", framework::make_ddim(shape));
   }
 };
@@ -47,10 +47,10 @@ class FillConstantOp : public framework::OperatorBase {
 
     if (out_var.IsType<framework::LoDTensor>()) {
       tensor = out_var.GetMutable<framework::LoDTensor>();
-      tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
     } else if (out_var.IsType<framework::SelectedRows>()) {
       tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
-      tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
     } else {
       PADDLE_THROW(
           "fill constant op's output only"
@@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase {
   }
 };
 
+class FillConstantOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
 class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -77,7 +83,8 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    AddAttr<std::vector<int64_t>>("shape",
+                                  "(vector<int64_t>) The shape of the output");
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
     AddAttr<bool>("force_cpu",
@@ -102,4 +109,5 @@ Fill up a variable with specified constant value.
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
                   ops::FillConstantInferShape, ops::FillConstantOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::FillConstantOpVarTypeInference);
diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index a04c1c1263fba659e2d3f623b607e9f476bb40ed..120b2ab440156f6020fd6005dd64a48e9a6918ec 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -16,10 +16,9 @@ limitations under the License. */
 #include <cstring>  // for memcpy
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
+#include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/platform/cpu_info.h"
 
 namespace paddle {
 namespace operators {
@@ -174,58 +173,44 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     }
   }
 
-#define INIT_VEC_FUNC                                                     \
-  std::function<void(const int, const T *, T *)> act_gate, act_state;     \
-  std::function<void(const int, const T*, const T*, const T*, T*)> cross; \
-  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");          \
-  auto& act_state_str = ctx.Attr<std::string>("activation");              \
-  if (platform::jit::MayIUse(platform::jit::avx)) {                       \
-    math::VecActivations<T, platform::jit::avx> act_functor;              \
-    act_gate = act_functor(act_gate_str);                                 \
-    act_state = act_functor(act_state_str);                               \
-    cross = math::vec_cross<T, platform::jit::avx>;                       \
-  } else {                                                                \
-    math::VecActivations<T, platform::jit::isa_any> act_functor;          \
-    act_gate = act_functor(act_gate_str);                                 \
-    act_state = act_functor(act_state_str);                               \
-    cross = math::vec_cross<T, platform::jit::isa_any>;                   \
-  }
-
-#define INIT_BASE_INPUT_OUTPUT                        \
-  auto* h0 = ctx.Input<Tensor>("H0");                 \
-  auto* wx = ctx.Input<Tensor>("WeightX");            \
-  auto* wh = ctx.Input<Tensor>("WeightH");            \
-  auto* bias = ctx.Input<Tensor>("Bias");             \
-  auto* xx = ctx.Output<LoDTensor>("XX");             \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");
-
-#define INIT_BASE_SIZES                  \
-  auto x_dims = x->dims();   /* T x M*/  \
-  auto wh_dims = wh->dims(); /* D x 3D*/ \
-  const int total_T = x_dims[0];         \
-  const int M = x_dims[1];               \
-  const int D = wh_dims[0];              \
-  const int D3 = wh_dims[1];             \
-  const int D2 = D * 2;
+#define INIT_BASE_DEFINES                  \
+  auto* x = ctx.Input<LoDTensor>("X");     \
+  auto* wh = ctx.Input<Tensor>("WeightH"); \
+  auto* xx = ctx.Output<LoDTensor>("XX");  \
+  auto x_lod = x->lod();                   \
+  auto x_dims = x->dims();   /* T x M*/    \
+  auto wh_dims = wh->dims(); /* D x 3D*/   \
+  const int total_T = x_dims[0];           \
+  const int D3 = wh_dims[1]
+
+#define INIT_OTHER_DEFINES                                                     \
+  auto* h0 = ctx.Input<Tensor>("H0");                                          \
+  auto* wx = ctx.Input<Tensor>("WeightX");                                     \
+  auto* bias = ctx.Input<Tensor>("Bias");                                      \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                          \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");                              \
+  const int M = x_dims[1];                                                     \
+  const int D = wh_dims[0];                                                    \
+  const int D2 = D * 2;                                                        \
+  const auto& ker = math::jitkernel::KernelPool::Instance()                    \
+                        .template Get<math::jitkernel::GRUKernel<T>,           \
+                                      const std::string&, const std::string&>( \
+                            ctx.Attr<std::string>("gate_activation"),          \
+                            ctx.Attr<std::string>("activation"), D);           \
+  const T* x_data = x->data<T>();                                              \
+  const T* wx_data = wx->data<T>();                                            \
+  const T* wh_data = wh->data<T>();                                            \
+  auto place = ctx.GetPlace();                                                 \
+  T* xx_data = xx->mutable_data<T>(place)
 
   void SeqCompute(const framework::ExecutionContext& ctx) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
-    INIT_VEC_FUNC
-
-    auto x_lod = x->lod();
+    INIT_BASE_DEFINES;
+    INIT_OTHER_DEFINES;
     const int N = x_lod[0].size() - 1;
-    const T* x_data = x->data<T>();
     const T* h0_data = h0 ? h0->data<T>() : nullptr;
-    const T* wx_data = wx->data<T>();
-    const T* wh_data = wh->data<T>();
     const T* wh_state_data = wh_data + D * D2;
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
-
+    T* hidden_out_data = hidden_out->mutable_data<T>(place);
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
     math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
                                       xx_data,
@@ -252,14 +237,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       if (h0_data) {
         prev_hidden_data = h0_data + bid * D;
       } else {
-        // W: {W_update, W_reset; W_state}
-        // update gate
-        act_gate(D, xx_data, xx_data);
-        // state gate
-        act_state(D, xx_data + D2, xx_data + D2);
-        // out = a*b
-        blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data);
-        // save prev
+        ker->ComputeH1(xx_data, hidden_out_data);
         prev_hidden_data = hidden_out_data;
         tstart = 1;
         move_step();
@@ -269,17 +247,12 @@ class FusionGRUKernel : public framework::OpKernel<T> {
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
                   prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
                   D3);
-        act_gate(D2, xx_data, xx_data);
-        // rt = rt*ht_1 inplace result
-        blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data);
-
+        ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data);
         // gemm rt * Ws
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
                   hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
                   xx_data + D2, D3);
-        act_state(D, xx_data + D2, xx_data + D2);
-        // out = zt*ht~ + (1-zt)*ht_1
-        cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data);
+        ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data);
         // save prev
         prev_hidden_data = hidden_out_data;
         move_step();
@@ -289,28 +262,19 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 
   void BatchCompute(const framework::ExecutionContext& ctx) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
-    if (x->lod()[0].size() == 2) {
+    INIT_BASE_DEFINES;
+    if (x_lod[0].size() == 2) {
       xx->Resize({total_T, D3});
       SeqCompute(ctx);
       return;
     }
-    INIT_VEC_FUNC
-
+    INIT_OTHER_DEFINES;
     auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
     auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
     auto* batched_out = ctx.Output<LoDTensor>("BatchedOut");
-
-    const T* x_data = x->data<T>();
-    const T* wx_data = wx->data<T>();
-    const T* wh_data = wh->data<T>();
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* batched_input_data = batched_input->mutable_data<T>(ctx.GetPlace());
-    T* batched_out_data = batched_out->mutable_data<T>(ctx.GetPlace());
-    hidden_out->mutable_data<T>(ctx.GetPlace());
-
+    T* batched_input_data = batched_input->mutable_data<T>(place);
+    T* batched_out_data = batched_out->mutable_data<T>(place);
+    hidden_out->mutable_data<T>(place);
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
@@ -336,7 +300,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     T* prev_hidden_data = nullptr;
     if (h0) {
       // reorder h0
-      T* reordered_h0_data = reordered_h0->mutable_data<T>(ctx.GetPlace());
+      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
       const T* h0_data = h0->data<T>();
       prev_hidden_data = reordered_h0_data;
       size_t sz = sizeof(T) * D;
@@ -350,12 +314,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       T* cur_out_data = batched_out_data;
       // W: {W_update, W_reset; W_state}
       for (int i = 0; i < max_bs; ++i) {
-        // update gate
-        act_gate(D, cur_in_data, cur_in_data);
-        // state gate
-        act_state(D, cur_in_data + D2, cur_in_data + D2);
-        // out = a*b
-        blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data);
+        ker->ComputeH1(cur_in_data, cur_out_data);
         // add offset
         cur_in_data += D3;
         cur_out_data += D;
@@ -380,10 +339,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       T* cur_out_data = batched_out_data;
       T* cur_prev_hidden_data = prev_hidden_data;
       for (int i = 0; i < cur_bs; ++i) {
-        act_gate(D2, cur_batched_data, cur_batched_data);
-        // rt = rt*ht_1 inplace result
-        blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data);
-
+        ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data,
+                            cur_out_data);
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
@@ -397,12 +354,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 
       cur_prev_hidden_data = prev_hidden_data;
       for (int i = 0; i < cur_bs; ++i) {
-        // ht~ = act_state(...)
-        act_state(D, cur_batched_data + D2, cur_batched_data + D2);
-        // out = zt*ht~ + (1-zt)*ht_1
-        cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data,
-              cur_out_data);
-
+        ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data,
+                            cur_out_data);
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
@@ -416,9 +369,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     batched_out->set_lod(batched_lod);
     to_seq(dev_ctx, *batched_out, hidden_out);
   }
-#undef INIT_VEC_FUNC
-#undef INIT_BASE_SIZES
-#undef INIT_BASE_INPUT_OUTPUT
+#undef INIT_OTHER_DEFINES
+#undef INIT_BASE_DEFINES
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index ae1f6d8e489039667d861a69acabf2c632ef2061..067e6a3e7cccc1f15ebdd984f3a2441339a989ab 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -15,11 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
+#include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/platform/cpu_info.h"
 
 namespace paddle {
 namespace operators {
@@ -219,121 +217,55 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
 template <typename T>
 class FuisonLSTMKernel : public framework::OpKernel<T> {
  public:
-#define INIT_VEC_FUNC                                                          \
-  std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
-  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
-  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
-  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
-  if (platform::jit::MayIUse(platform::jit::avx)) {                            \
-    math::VecActivations<T, platform::jit::avx> act_functor;                   \
-    act_gate = act_functor(act_gate_str);                                      \
-    act_cell = act_functor(act_cell_str);                                      \
-    act_cand = act_functor(act_cand_str);                                      \
-  } else {                                                                     \
-    math::VecActivations<T, platform::jit::isa_any> act_functor;               \
-    act_gate = act_functor(act_gate_str);                                      \
-    act_cell = act_functor(act_cell_str);                                      \
-    act_cand = act_functor(act_cand_str);                                      \
-  }
-
-#define INIT_BASE_INPUT_OUTPUT                        \
-  auto* x = ctx.Input<LoDTensor>("X");                \
-  auto* h0 = ctx.Input<Tensor>("H0");                 \
-  auto* c0 = ctx.Input<Tensor>("C0");                 \
-  auto* wx = ctx.Input<Tensor>("WeightX");            \
-  auto* wh = ctx.Input<Tensor>("WeightH");            \
-  auto* bias = ctx.Input<Tensor>("Bias");             \
-  auto* xx = ctx.Output<LoDTensor>("XX");             \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
-  auto* cell_out = ctx.Output<LoDTensor>("Cell");     \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
-  bool use_peepholes = ctx.Attr<bool>("use_peepholes");
-
-#define INIT_BASE_SIZES                  \
-  auto x_dims = x->dims();   /* T x M*/  \
-  auto wh_dims = wh->dims(); /* D x 4D*/ \
-  const int M = x_dims[1];               \
-  const int D = wh_dims[0];              \
-  const int D2 = D * 2;                  \
-  const int D3 = D * 3;                  \
-  const int D4 = wh_dims[1];
-
-#define INIT_BASE_INPUT_DATAS                                 \
-  const T* x_data = x->data<T>();                             \
-  const T* wx_data = wx->data<T>();                           \
-  const T* wh_data = wh->data<T>();                           \
-  /* diagonal weight*/                                        \
-  const T* wc_data = bias->data<T>() + D4;                    \
-  /* for peephole only*/                                      \
-  T* checked_cell_data = nullptr;                             \
-  auto place = ctx.GetPlace();                                \
-  if (use_peepholes) {                                        \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/          \
-    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");   \
-    checked_cell_data = checked_cell->mutable_data<T>(place); \
-  }
-
-/// Compute LSTM
+#define INIT_BASE_DEFINES                                   \
+  using DeviceContext = paddle::platform::CPUDeviceContext; \
+  auto* x = ctx.Input<LoDTensor>("X");                      \
+  auto* h0 = ctx.Input<Tensor>("H0");                       \
+  auto* c0 = ctx.Input<Tensor>("C0");                       \
+  auto* wx = ctx.Input<Tensor>("WeightX");                  \
+  auto* wh = ctx.Input<Tensor>("WeightH");                  \
+  auto* bias = ctx.Input<Tensor>("Bias");                   \
+  auto* xx = ctx.Output<LoDTensor>("XX");                   \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");       \
+  auto* cell_out = ctx.Output<LoDTensor>("Cell");           \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");           \
+  bool use_peepholes = ctx.Attr<bool>("use_peepholes");     \
+  auto x_dims = x->dims();   /* T x M*/                     \
+  auto wh_dims = wh->dims(); /* D x 4D*/                    \
+  const int M = x_dims[1];                                  \
+  const int D = wh_dims[0];                                 \
+  const int D4 = wh_dims[1]
+
+#define INIT_OTHER_DEFINES                                                  \
+  const T* x_data = x->data<T>();                                           \
+  const T* wx_data = wx->data<T>();                                         \
+  const T* wh_data = wh->data<T>();                                         \
+  /* diagonal weight*/                                                      \
+  const T* wp_data = bias->data<T>() + D4;                                  \
+  /* for peephole only*/                                                    \
+  T* checked_cell_data = nullptr;                                           \
+  auto place = ctx.GetPlace();                                              \
+  if (use_peepholes) {                                                      \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                        \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                 \
+    checked_cell_data = checked_cell->mutable_data<T>(place);               \
+  }                                                                         \
+  const auto& ker =                                                         \
+      math::jitkernel::KernelPool::Instance()                               \
+          .template Get<math::jitkernel::LSTMKernel<T>, const std::string&, \
+                        const std::string&, const std::string&>(            \
+              ctx.Attr<std::string>("gate_activation"),                     \
+              ctx.Attr<std::string>("candidate_activation"),                \
+              ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
+
+// Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                           \
   blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
             wh_data, D4, static_cast<T>(1), out, D4)
 
-#define GET_Ct(ct_1, gates, ct)                   \
-  /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
-  act_cand(D, gates, gates);                      \
-  blas.VMUL(D, gates, gates + D, gates + D);      \
-  blas.VMUL(D, ct_1, gates + D2, gates + D2);     \
-  blas.VADD(D, gates + D, gates + D2, ct)
-
-#define GET_Ht(ct, gates, ht)        \
-  /* H_t = act_cell(C_t) * ogated */ \
-  act_cell(D, ct, gates + D2);       \
-  blas.VMUL(D, gates + D2, gates + D3, ht)
-
-#define GET_Ct_NOH0C0(gates, ct)     \
-  /* C_t = igated * cgated*/         \
-  act_gate(D, gates + D, gates + D); \
-  act_cand(D, gates, gates);         \
-  blas.VMUL(D, gates, gates + D, ct)
-
-#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
-  GET_Ct_NOH0C0(gates, ct);                \
-  act_gate(D, gates + D3, gates + D3);     \
-  GET_Ht(ct, gates, ht)
-
-#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
-  GET_Ct_NOH0C0(gates, ct);                         \
-  /* get outgated, put W_oc * C_t on igated */      \
-  blas.VMUL(D, wc_data + D2, ct, gates + D);        \
-  blas.VADD(D, gates + D, gates + D3, gates + D3);  \
-  act_gate(D, gates + D3, gates + D3);              \
-  GET_Ht(ct, gates, ht)
-
-#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
-  act_gate(D3, gates + D, gates + D);     \
-  GET_Ct(ct_1, gates, ct);                \
-  GET_Ht(ct, gates, ht)
-
-#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht)        \
-  /* get fgated and igated*/                              \
-  blas.VMUL(D, wc_data, ct_1, checked_cell_data);         \
-  blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
-  blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
-  act_gate(D2, gates + D, gates + D);                     \
-  GET_Ct(ct_1, gates, ct);                                \
-  /* get ogated*/                                         \
-  blas.VMUL(D, wc_data + D2, ct, gates + D);              \
-  blas.VADD(D, gates + D, gates + D3, gates + D3);        \
-  act_gate(D, gates + D3, gates + D3);                    \
-  GET_Ht(ct, gates, ht)
-
   void SeqCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
-    INIT_VEC_FUNC
-    INIT_BASE_INPUT_DATAS
-
+    INIT_BASE_DEFINES;
+    INIT_OTHER_DEFINES;
     auto x_lod = x->lod();
     const int total_T = x_dims[0];
     const int N = x_lod[0].size() - 1;
@@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       gate_offset = -D;
     }
 
-#define MOVE_ONE_STEP                    \
-  prev_h_data = h_out_data;              \
-  prev_c_data = c_out_data;              \
-  xx_data = xx_data + xx_offset;         \
-  h_out_data = h_out_data + gate_offset; \
-  c_out_data = c_out_data + gate_offset
-
-#define PROCESS_H0C0_DEFINES                       \
-  int bid = is_reverse ? N - 1 - i : i;            \
-  int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
-  const T* prev_c_data = nullptr;                  \
-  const T* prev_h_data = nullptr;                  \
-  int tstart = 0
-
-#define PROCESS_H0C0_PEEPHOLE                                      \
-  PROCESS_H0C0_DEFINES;                                            \
-  if (h0_data) {                                                   \
-    prev_h_data = h0_data + bid * D;                               \
-    prev_c_data = c0_data + bid * D;                               \
-  } else {                                                         \
-    COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
-    MOVE_ONE_STEP;                                                 \
-    tstart = 1;                                                    \
-  }
-
-#define PROCESS_H0C0                                      \
-  PROCESS_H0C0_DEFINES;                                   \
-  if (h0_data) {                                          \
-    prev_h_data = h0_data + bid * D;                      \
-    prev_c_data = c0_data + bid * D;                      \
-  } else {                                                \
-    COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
-    MOVE_ONE_STEP;                                        \
-    tstart = 1;                                           \
-  }
-
-    if (use_peepholes) {
-      for (int i = 0; i < N; ++i) {
-        PROCESS_H0C0_PEEPHOLE
-        for (int step = tstart; step < seq_len; ++step) {
-          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
-          MOVE_ONE_STEP;
-        }
-      }
-    } else {
-      // TODO(TJ): unly workaround, clean me
-      std::function<void(T*, const T*, T*, T*)> compute_ctht;
-      if (platform::jit::MayIUse(platform::jit::avx) &&
-          act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
-          act_cell_str == "tanh" && D == 8) {
-        compute_ctht = math::lstm_compute_ctht<T>;
+    for (int i = 0; i < N; ++i) {
+      int bid = is_reverse ? N - 1 - i : i;
+      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
+      const T* prev_c_data = nullptr;
+      const T* prev_h_data = nullptr;
+      int tstart = 0;
+      if (h0_data) {
+        prev_h_data = h0_data + bid * D;
+        prev_c_data = c0_data + bid * D;
       } else {
-        compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
-          COMPUTE_CtHt(gates, ct_1, ct, ht);
-        };
+        ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data);
+        tstart = 1;
+        // move one step
+        prev_h_data = h_out_data;
+        prev_c_data = c_out_data;
+        xx_data = xx_data + xx_offset;
+        h_out_data = h_out_data + gate_offset;
+        c_out_data = c_out_data + gate_offset;
       }
-      for (int i = 0; i < N; ++i) {
-        PROCESS_H0C0
-        for (int step = tstart; step < seq_len; ++step) {
-          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data);
-          MOVE_ONE_STEP;
-        }
+      for (int step = tstart; step < seq_len; ++step) {
+        GEMM_WH_ADDON(1, prev_h_data, xx_data);
+        ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data,
+                         checked_cell_data);
+        // move one step
+        prev_h_data = h_out_data;
+        prev_c_data = c_out_data;
+        xx_data = xx_data + xx_offset;
+        h_out_data = h_out_data + gate_offset;
+        c_out_data = c_out_data + gate_offset;
       }
     }
-#undef PROCESS_H0C0_DEFINES
-#undef PROCESS_H0C0_PEEPHOLE
-#undef PROCESS_H0C0
-#undef MOVE_ONE_STEP
   }
 
   void BatchCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = platform::CPUDeviceContext;
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
+    INIT_BASE_DEFINES;
     if (x->lod()[0].size() == 2) {
       xx->Resize({x_dims[0], D4});
       SeqCompute(ctx);
       return;
     }
-    INIT_VEC_FUNC
-    INIT_BASE_INPUT_DATAS
+    INIT_OTHER_DEFINES;
 
     auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
     auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
@@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       prev_c_data = reordered_c0_data;
       size_t sz = sizeof(T) * D;
       for (int i = 0; i < max_bs; ++i) {
-        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
-        std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
+        blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data);
+        blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data);
         reordered_h0_data += D;
         reordered_c0_data += D;
       }
@@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       T* cur_h_out_data = batched_h_out_data;
       T* cur_c_out_data = batched_c_out_data;
       for (int i = 0; i < max_bs; ++i) {
-        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
-        if (use_peepholes) {
-          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
-          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
-        }
-        act_gate(D, cur_in_data + D3, cur_in_data + D3);
-        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
+        ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data);
         cur_in_data += D4;
         cur_c_out_data += D;
         cur_h_out_data += D;
@@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       prev_h_data = batched_h_out_data;
       prev_c_data = batched_c_out_data;
     }
+
+    // compute kernel part
     const auto& batch_starts = batched_lod[0];
     const int max_seq_len = batch_starts.size() - 1;
     const int offset = tstart * max_bs * D;
     batched_input_data = batched_input_data + offset * 4;
     batched_h_out_data = batched_h_out_data + offset;
     batched_c_out_data = batched_c_out_data + offset;
-
-#define DEFINE_CUR                        \
-  T* cur_in_data = batched_input_data;    \
-  T* cur_prev_c_data = prev_c_data;       \
-  T* cur_c_out_data = batched_c_out_data; \
-  T* cur_h_out_data = batched_h_out_data
-
-#define MOVE_ONE_BATCH  \
-  cur_in_data += D4;    \
-  cur_prev_c_data += D; \
-  cur_c_out_data += D;  \
-  cur_h_out_data += D
-
-#define MOVE_ONE_STEP                  \
-  prev_c_data = batched_c_out_data;    \
-  prev_h_data = batched_h_out_data;    \
-  batched_c_out_data = cur_c_out_data; \
-  batched_h_out_data = cur_h_out_data; \
-  batched_input_data = cur_in_data
-
-    if (use_peepholes) {
-      for (int step = tstart; step < max_seq_len; ++step) {
-        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-        DEFINE_CUR;
-        for (int i = 0; i < cur_bs; ++i) {
-          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                                cur_h_out_data);
-          MOVE_ONE_BATCH;
-        }
-        MOVE_ONE_STEP;
-      }
-    } else {
-      // TODO(TJ): unly workaround, clean me
-      std::function<void(T*, const T*, T*, T*)> compute_ctht;
-      if (platform::jit::MayIUse(platform::jit::avx) &&
-          act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
-          act_cell_str == "tanh" && D == 8) {
-        compute_ctht = math::lstm_compute_ctht<T>;
-      } else {
-        compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
-          COMPUTE_CtHt(gates, ct_1, ct, ht);
-        };
-      }
-      for (int step = tstart; step < max_seq_len; ++step) {
-        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-        DEFINE_CUR;
-        for (int i = 0; i < cur_bs; ++i) {
-          compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                       cur_h_out_data);
-          MOVE_ONE_BATCH;
-        }
-        MOVE_ONE_STEP;
+    for (int step = tstart; step < max_seq_len; ++step) {
+      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+      GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+      T* cur_in_data = batched_input_data;
+      T* cur_prev_c_data = prev_c_data;
+      T* cur_c_out_data = batched_c_out_data;
+      T* cur_h_out_data = batched_h_out_data;
+      for (int i = 0; i < cur_bs; ++i) {
+        ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
+                         cur_h_out_data, wp_data, checked_cell_data);
+        // move one batch
+        cur_in_data += D4;
+        cur_prev_c_data += D;
+        cur_c_out_data += D;
+        cur_h_out_data += D;
       }
+      // move one step
+      prev_c_data = batched_c_out_data;
+      prev_h_data = batched_h_out_data;
+      batched_c_out_data = cur_c_out_data;
+      batched_h_out_data = cur_h_out_data;
+      batched_input_data = cur_in_data;
     }
-#undef MOVE_ONE_STEP
-#undef MOVE_ONE_BATCH
-#undef DEFINE_CUR
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batched_h_out->set_lod(batched_lod);
@@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
     }
   }
 
-#undef COMPUTE_CtHt_PEEPHOLE
-#undef COMPUTE_CtHt
-#undef GET_Ct_NOH0C0
-#undef COMPUTE_CtHt_NOH0C0
-#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
-#undef GET_Ht
-#undef GET_Ct
 #undef GEMM_WH_ADDON
-#undef INIT_BASE_INPUT_DATAS
-#undef INIT_BASE_SIZES
-#undef INIT_BASE_INPUT_OUTPUT
-#undef INIT_VEC_FUNC
+#undef INIT_OTHER_DEFINES
+#undef INIT_BASE_DEFINES
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0910dc19edb246d9acfe3bdb15071c64cbdaba7
--- /dev/null
+++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
@@ -0,0 +1,229 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h"
+#include <algorithm>  // for min, max
+#include <string>
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
+
+namespace paddle {
+namespace operators {
+
+void FusionSeqConvEltAddReluOp::InferShape(
+    framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"),
+                 "Input(X) of FusionSeqConvEltAddReluOp should not be null.");
+  PADDLE_ENFORCE(
+      ctx->HasInput("Filter"),
+      "Input(Filter) of FusionSeqConvEltAddReluOp should not be null.");
+  PADDLE_ENFORCE(
+      ctx->HasInput("Bias"),
+      "Input(Bias) of FusionSeqConvEltAddReluOp should not be null.");
+  PADDLE_ENFORCE(
+      ctx->HasOutput("Out"),
+      "Output(Out) of FusionSeqConvEltAddReluOp should not be null.");
+  PADDLE_ENFORCE(
+      ctx->HasOutput("ColMat"),
+      "Output(ColMat) of FusionSeqConvEltAddReluOp should not be null.");
+
+  auto x_dims = ctx->GetInputDim("X");
+  auto w_dims = ctx->GetInputDim("Filter");
+  int context_length = ctx->Attrs().Get<int>("contextLength");
+  PADDLE_ENFORCE(
+      ctx->Attrs().Get<int>("contextStride") == 1,
+      "Currently, FusionSeqConvEltAddReluOp only supports contextStride=1.");
+  PADDLE_ENFORCE(x_dims.size() == 2 && w_dims.size() == 2,
+                 "Input(X, Filter) should be 2-D tensor.");
+  PADDLE_ENFORCE(x_dims.size() == 2 && w_dims.size() == 2,
+                 "Input(X, Filter) should be 2-D tensor.");
+  PADDLE_ENFORCE(w_dims[0] == context_length * x_dims[1],
+                 "Filter's height should be context_length * "
+                 "input_hidden_size .");
+  PADDLE_ENFORCE_GT(context_length + ctx->Attrs().Get<int>("contextStart"), 0,
+                    "contextStart size should be smaller than contextLength.");
+
+  ctx->SetOutputDim("Out", {x_dims[0], w_dims[1]});
+  ctx->SetOutputDim("ColMat", {x_dims[0], w_dims[0]});
+  ctx->ShareLoD("X", "Out");
+}
+
+framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+      ctx.device_context());
+}
+
+void FusionSeqConvEltAddReluOpMaker::Make() {
+  AddInput("X",
+           "(LoDTensor) the input is a LodTensor, which support "
+           "variable-time length input sequence. The underlying tensor in "
+           "this LoDTensor is a matrix with shape (T X M), where T is the "
+           "total time steps in this mini-batch, M is the dim size of x.");
+  // PaddingData only support false yet, should be ensured at pass.
+  AddInput("Filter",
+           "(Tensor) same as the input(Filter) of sequence conv op is an "
+           "learnable parameter."
+           "This is a tensor with shape (K, N), where K is the "
+           "context_length * dim size of x, N is the output feature size.");
+  AddInput("Bias",
+           "(Tensor) the learnable weights. shape (1, N), where N is the "
+           "output feature size");
+  AddOutput(
+      "Out",
+      "(LoDTensor) the output(Out) is a LodTensor, which support "
+      "variable-time length output sequence. The underlying tensor in "
+      "this LoDTensor is a matrix with shape (T, N), where, T is the "
+      "total time steps in this mini-batch, N is the output feature size.");
+  AddOutput("ColMat",
+            "(Tensor) (T, K), where T is where T is the "
+            "total time steps in this mini-batch, K is height of Filter")
+      .AsIntermediate();
+  AddAttr<int>("contextLength",
+               "(int) the contextLength of FusionSeqConvEltAddReluOp is the "
+               "height of the convolution kernel.")
+      .GreaterThan(0);
+  AddAttr<int>("contextStart",
+               "(int, default:0) the contextStart of FusionSeqConvEltAddReluOp "
+               "represents the beginning of the convolution of the number of "
+               "rows of sequence, which can be negative. The negative number "
+               "means to pad contextStart time-steps of zeros or learnable "
+               "parameters at the beginning of each instance. The positive "
+               "number means to skip contextStart time-steps of each "
+               "instance.")
+      .SetDefault(0);
+  AddAttr<int>(
+      "contextStride",
+      "(int, default:1) the contextStride of FusionSeqConvEltAddReluOp "
+      "represents the stride length of convolution kernel. "
+      "Currently, FusionSeqConvEltAddReluOp only supports"
+      "contextStride=1.")
+      .SetDefault(1)
+      .GreaterThan(0);
+  AddComment(R"DOC(
+Fusion Sequence Conv and ElementwiseAdd Operator.
+)DOC");
+}
+
+template <typename T>
+class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* w = ctx.Input<Tensor>("Filter");
+    auto* b = ctx.Input<Tensor>("Bias");
+    auto* y = ctx.Output<LoDTensor>("Out");
+    auto* col = ctx.Output<Tensor>("ColMat");
+
+    auto x_lod = x->lod();
+    auto x_dims = x->dims();
+    auto w_dims = w->dims();
+    PADDLE_ENFORCE_EQ(b->numel(), w_dims[1],
+                      "bias size should be equal to output feature size.");
+    PADDLE_ENFORCE_EQ(x_lod.size(), 1UL,
+                      "Only support one level sequence now.");
+
+    const T* x_data = x->data<T>();
+    const T* w_data = w->data<T>();
+    const T* b_data = b->data<T>();
+    T* y_data = y->mutable_data<T>(ctx.GetPlace());
+    T* col_data = col->mutable_data<T>(ctx.GetPlace());
+
+    int context_start = ctx.Attr<int>("contextStart");
+    int context_length = ctx.Attr<int>("contextLength");
+    int up_pad = std::max(0, -context_start);
+    int down_pad = std::max(0, context_start + context_length - 1);
+    // im2col
+    int src_mat_w = static_cast<int>(x_dims[1]);
+    int src_mat_w_sz = src_mat_w * sizeof(T);
+    int col_mat_w = static_cast<int>(w_dims[0]);
+    int col_mat_w_sz = col_mat_w * sizeof(T);
+    for (int i = 0; i < static_cast<int>(x_lod[0].size()) - 1; ++i) {
+      int st = x_lod[0][i];
+      int ed = x_lod[0][i + 1];
+      const T* src_data = x_data + st * src_mat_w;
+      T* dst_data = col_data + st * col_mat_w;
+      int seq_len = ed - st;
+      if (seq_len > up_pad + down_pad) {
+        // zero all up_pad and fill data
+        std::memset(dst_data, 0, up_pad * col_mat_w_sz);
+        dst_data = dst_data + up_pad * src_mat_w;
+        int copy_size = col_mat_w_sz - up_pad * src_mat_w_sz;
+        for (int j = 0; j < up_pad; ++j) {
+          // blas.VCOPY?
+          std::memcpy(dst_data, src_data, copy_size);
+          dst_data += (col_mat_w - src_mat_w);
+          copy_size += src_mat_w_sz;
+        }
+        // fill data
+        for (int j = 0; j < seq_len - up_pad - down_pad; ++j) {
+          std::memcpy(dst_data, src_data, copy_size);
+          dst_data += col_mat_w;
+          src_data += src_mat_w;
+        }
+        // zero all down_pad and fill data
+        std::memset(dst_data, 0, down_pad * col_mat_w_sz);
+        copy_size -= src_mat_w_sz;
+        for (int j = 0; j < down_pad; ++j) {
+          std::memcpy(dst_data, src_data, copy_size);
+          dst_data += col_mat_w;
+          src_data += src_mat_w;
+          copy_size -= src_mat_w_sz;
+        }
+      } else {
+        PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1);
+        std::memset(dst_data, 0, seq_len * col_mat_w_sz);
+        dst_data = dst_data + up_pad * src_mat_w;
+        int zero_sz = up_pad * src_mat_w_sz;
+        int cur_src_sz = seq_len * src_mat_w_sz;
+        for (int j = 0; j < std::min(up_pad, seq_len); ++j) {
+          int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
+          std::memcpy(dst_data, src_data, copy_size);
+          dst_data += (col_mat_w - src_mat_w);
+          zero_sz -= src_mat_w_sz;
+        }
+        // from bottom
+        dst_data = col_data + ed * col_mat_w;
+        src_data = x_data + st * src_mat_w;
+        zero_sz = down_pad * src_mat_w_sz;
+        for (int j = 1; j <= std::min(down_pad, seq_len); ++j) {
+          int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
+          std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T),
+                      src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w,
+                      copy_size);
+          dst_data -= col_mat_w;
+          zero_sz -= src_mat_w_sz;
+        }
+      }
+    }
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    math::FCCompute<DeviceContext, T>(blas, x_dims[0], w_dims[1], w_dims[0],
+                                      col_data, w_data, y_data, b_data, true);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fusion_seqconv_eltadd_relu, ops::FusionSeqConvEltAddReluOp,
+                  ops::FusionSeqConvEltAddReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+
+REGISTER_OP_CPU_KERNEL(fusion_seqconv_eltadd_relu,
+                       ops::FusionSeqConvEltAddReluKernel<float>,
+                       ops::FusionSeqConvEltAddReluKernel<double>);
diff --git a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..028d79dc2a1ee8d789fe4b8724b320442041a71b
--- /dev/null
+++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+class FusionSeqConvEltAddReluOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusionSeqConvEltAddReluOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
index 0cd3d3887cf5167c779a8b20442fdb458cd7eab4..8d2f055d53a0c5bbef624ff3b01b01724d0b3a21 100644
--- a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
@@ -136,9 +136,9 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
     // since infershape can not get lod info
     PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1.");
     PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1.");
-    PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N,
+    PADDLE_ENFORCE_EQ(static_cast<int>(in1_lod[0].size() - 1), N,
                       "Batch size of all inputs should be equal.");
-    PADDLE_ENFORCE_EQ(in1_lod[0][N], N,
+    PADDLE_ENFORCE_EQ(static_cast<int>(in1_lod[0][N]), N,
                       "Seq_length of other inputs should be 1.");
     PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size.");
     for (size_t i = 2; i < ins.size(); ++i) {
diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h
index d15cb55647ade2415041b11099974484835f55eb..d72e07d76c97e9e455e54980207d7c02842cc04b 100644
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
@@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
   PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
   PADDLE_ENFORCE(index.dims().size() == 1);
-  int index_size = index.dims()[0];
+  int64_t index_size = index.dims()[0];
 
   auto src_dims = src.dims();
-  framework::DDim output_dims(src_dims);
-  output_dims[0] = index_size;
 
   const T* p_src = src.data<T>();
   const int* p_index = index.data<int>();
@@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
 
   const size_t slice_bytes = slice_size * sizeof(T);
 
-  for (int i = 0; i < index_size; ++i) {
+  for (int64_t i = 0; i < index_size; ++i) {
     int index_ = p_index[i];
     memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
   }
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 089b541a0a61adb5efda6b2e027c913d5808dff0..f84ff206fffddef1030b7ed439e887bdfef342a6 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
-                       ops::GatherOpKernel<int>, ops::GatherOpKernel<double>);
+                       ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
+                       ops::GatherOpKernel<int64_t>);
 REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
+                       ops::GatherGradientOpKernel<double>,
                        ops::GatherGradientOpKernel<int>,
-                       ops::GatherGradientOpKernel<double>);
+                       ops::GatherGradientOpKernel<int64_t>);
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 7e014dd1cb47ee0575308dc13ba7bc7617baebff..9f4aef08cd58e72ce344a640e6564b9e360ce169 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
-REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
+                        ops::GatherOpCUDAKernel<double>,
+                        ops::GatherOpCUDAKernel<int64_t>,
+                        ops::GatherOpCUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
+                        ops::GatherGradOpCUDAKernel<double>,
+                        ops::GatherGradOpCUDAKernel<int64_t>,
+                        ops::GatherGradOpCUDAKernel<int>);
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 1488aab1926b5b4ba7bceed582700f5a11fc6c93..c70d5b8bc7569c38cbc003aca7c62dc503df11cf 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -52,7 +52,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of GaussianRandomOp should not be null.");
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
     std::vector<int64_t> temp;
     temp.reserve(shape.size());
     for (auto dim : shape) {
@@ -88,9 +88,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddOutput("Out", "Output matrix of gaussian random op");
 
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) "
-                              "The dimension of random tensor.");
+    AddAttr<std::vector<int64_t>>("shape",
+                                  "(vector<int64_t>) "
+                                  "The dimension of random tensor.");
     AddAttr<float>("mean",
                    "(float, default 0.0) "
                    "mean of random tensor.")
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b9ebe71a3d7ae270a10a45f4805652415078b363
--- /dev/null
+++ b/paddle/fluid/operators/hash_op.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/hash_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class HashOp : public framework::OperatorWithKernel {
+ public:
+  HashOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of HashOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of HashOp should not be null.");
+
+    auto dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(dims.size(), 2UL,
+                      "The input of hash_op's dimensions must be 2");
+    std::vector<int64_t> out_dims;
+    out_dims.reserve(dims.size() + 1);
+    // copy all dims except the last one
+    for (size_t i = 0u; i != dims.size() - 1; ++i) {
+      out_dims.emplace_back(dims[i]);
+    }
+    int num_hash = ctx->Attrs().Get<int>("num_hash");
+    out_dims.emplace_back(num_hash);
+    // keep the last dim to 1
+    out_dims.emplace_back(1);
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class HashOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddComment(R"DOC(
+**Hash Operator**
+$$Out = scale * X$$
+)DOC");
+    AddAttr<int>("num_hash", "").SetDefault(1);
+    AddAttr<int>("mod_by", "").SetDefault(100000);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker);
+REGISTER_OP_CPU_KERNEL(hash, ops::HashKerel<int>, ops::HashKerel<int64_t>);
diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9781bb0f453642cefb3eb59a05389c339a7de39d
--- /dev/null
+++ b/paddle/fluid/operators/hash_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+extern "C" {
+#include <xxhash.h>
+}
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// template <typename DeviceContext, typename T>
+template <typename T>
+class HashKerel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* out_t = context.Output<framework::LoDTensor>("Out");
+    auto* in_t = context.Input<framework::LoDTensor>("X");
+    int mod_by = context.Attr<int>("mod_by");
+    int num_hash = context.Attr<int>("num_hash");
+    auto* output = out_t->mutable_data<T>(context.GetPlace());
+
+    auto in_dims = in_t->dims();
+    auto in_lod = in_t->lod();
+    PADDLE_ENFORCE_EQ(
+        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        "The actual input data's size mismatched with LoD information.");
+
+    auto seq_length = in_dims[0];
+    auto last_dim = in_dims[in_dims.size() - 1];
+    auto* input = in_t->data<T>();
+    for (int idx = 0; idx < seq_length; ++idx) {
+      for (int ihash = 0; ihash != num_hash; ++ihash) {
+        output[idx * num_hash + ihash] =
+            XXH64(input, sizeof(int) * last_dim, ihash) % mod_by;
+      }
+      input += last_dim;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc
index 248c7793560db99c0af06421bf74808422016061..7b42efd623b31a703bf51d2d157130b3120b42a4 100644
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
@@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
               "(Tensor) 1-dim tensor, contains a bool scalar. The output "
               "tensor of overflow operator.");
     AddComment(string::Sprintf(R"DOC(
-Overflow operator.
+Overflow %s operator.
 
 $$Out = any(X)$$
 
@@ -69,6 +69,8 @@ Out = Inf if any X contains Inf,
 Out = Nan if any X contains Nan,
 Out = 0 if no Inf/Nan detected.
 If X contains both Inf/Nan, it will return the first indicator it meeted.
+
+%s
 )DOC",
                                GetName(), GetComments()));
   }
diff --git a/paddle/fluid/operators/lars_momentum_op.cc b/paddle/fluid/operators/lars_momentum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a8dda93902448fa1bd21b719ffd9c9b500caf755
--- /dev/null
+++ b/paddle/fluid/operators/lars_momentum_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/lars_momentum_op.h"
+#include "paddle/fluid/operators/momentum_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Param",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input parameter that has to be updated");
+    AddInput("Grad",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input gradient of the parameter");
+    AddInput("Velocity",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input velocity (corresponding to the parameter) "
+             "that has to be updated");
+    AddInput("LearningRate",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input learning rate");
+
+    AddOutput("ParamOut",
+              "(LoDTensor) This output is updated parameter. "
+              "It shared memory with Input(Param).");
+    AddOutput("VelocityOut",
+              "(LoDTensor) This output is updated velocity. "
+              "It shared memory with Input(Velocity).");
+
+    AddAttr<float>("mu", "(float) Momentum coefficient");
+    AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
+        .SetDefault(0.001);
+    AddAttr<float>("lars_weight_decay",
+                   "(float, default 0.0005) LARS weight decay")
+        .SetDefault(0.0005);
+
+    AddComment(R"DOC(
+Lars Momentum Optimizer.
+
+This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each
+weight using a local learning rate:
+
+$$
+local\_lr = \eta  * 
+    \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
+velocity = mu * velocity + 
+    local\_lr * (grad + \beta * param) \\
+param = param - velocity. \\
+$$
+
+Note that we use lars_weight_decay here to decay weights, you may need not to
+use L2 regularizers in case of using LARS.
+
+)DOC");
+  }
+};
+
+class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::LarsMomentumOpVarTypeInference);
+REGISTER_OP_CPU_KERNEL(lars_momentum, ops::LarsMomentumOpKernel<float>,
+                       ops::LarsMomentumOpKernel<double>);
diff --git a/paddle/fluid/operators/lars_momentum_op.cu b/paddle/fluid/operators/lars_momentum_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..eb346851a2f690fa05422c84ddcb08307539048f
--- /dev/null
+++ b/paddle/fluid/operators/lars_momentum_op.cu
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/lars_momentum_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
+                                   const T* learning_rate, const T mu,
+                                   const int64_t num, const T lars_coeff,
+                                   const T lars_weight_decay, const T* p_norm,
+                                   const T* g_norm, T* p_out, T* v_out) {
+  T lr = learning_rate[0];
+  T local_lr = learning_rate[0];
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    if (p_norm[0] > 0 && g_norm[0] > 0) {
+      local_lr = lr * lars_coeff * p_norm[0] /
+                 (g_norm[0] + lars_weight_decay * p_norm[0]);
+    }
+    T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
+    v_out[i] = v_new;
+    p_out[i] = p[i] - v_new;
+  }
+}
+
+template <typename DeviceContext, typename T>
+class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
+    auto param = ctx.Input<framework::LoDTensor>("Param");
+    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
+    auto grad = ctx.Input<framework::LoDTensor>("Grad");
+    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
+
+    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
+    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    T lars_coeff = ctx.Attr<float>("lars_coeff");
+    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
+
+    auto* p = param->data<T>();
+    auto* v = velocity->data<T>();
+    auto* g = grad->data<T>();
+    auto* lr = learning_rate->data<T>();
+
+    int block = 512;
+    int grid = (param->numel() + block - 1) / block;
+
+    auto eigen_p = framework::EigenVector<T>::Flatten(*param);
+    auto eigen_g = framework::EigenVector<T>::Flatten(*grad);
+    // calculate norms using eigein and launch the kernel.
+    framework::Tensor p_norm_t, g_norm_t;
+    p_norm_t.Resize({1});
+    g_norm_t.Resize({1});
+    auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
+    auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
+    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
+    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
+
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+    ep_norm.device(*place) = eigen_p.square().sum().sqrt();
+    eg_norm.device(*place) = eigen_g.square().sum().sqrt();
+    MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+        p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
+        p_norm_data, g_norm_data, p_out, v_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lars_momentum,
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/lars_momentum_op.h b/paddle/fluid/operators/lars_momentum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e85be99fc42522e461a7915847d82144d8195a96
--- /dev/null
+++ b/paddle/fluid/operators/lars_momentum_op.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class LarsMomentumOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
+    auto param = ctx.Input<framework::LoDTensor>("Param");
+    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
+    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
+    auto* grad_var = ctx.InputVar("Grad");
+    // only support dense for now.
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>());
+    auto grad = ctx.Input<framework::LoDTensor>("Grad");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    T lars_coeff = ctx.Attr<float>("lars_coeff");
+    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
+
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+    auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
+
+    auto p = framework::EigenVector<T>::Flatten(*param);
+    auto v = framework::EigenVector<T>::Flatten(*velocity);
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto* lr = learning_rate->data<T>();
+
+    framework::Tensor p_norm_t, g_norm_t;
+    p_norm_t.Resize({1});
+    g_norm_t.Resize({1});
+    p_norm_t.mutable_data<T>(ctx.GetPlace());
+    g_norm_t.mutable_data<T>(ctx.GetPlace());
+    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
+    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
+
+    ep_norm = p.square().sum().sqrt();
+    eg_norm = g.square().sum().sqrt();
+    T local_lr = lr[0];
+    if (ep_norm(0) > 0 && eg_norm(0) > 0) {
+      local_lr = lr[0] * lars_coeff * ep_norm(0) /
+                 (eg_norm(0) + lars_weight_decay * ep_norm(0));
+    }
+    v_out = v * mu + local_lr * (g + lars_weight_decay * p);
+    p_out = p - v_out;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index dc008d16971bc762b401ddece56f9ec56f7a47d6..a038bad701ba8ede3065af9f352f1f21784a50b7 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -27,6 +27,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 
+DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
+DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
+DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch");
+
 namespace paddle {
 namespace operators {
 
@@ -66,7 +70,7 @@ static void ParallelExecuteBlocks(
                 << "pointer: " << prepared[run_block].get();
         executor->RunPreparedContext(prepared[run_block].get(), scope);
       } catch (const std::exception &e) {
-        LOG(ERROR) << "run sub program error " << e.what();
+        LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
       }
     }));
   }
@@ -332,11 +336,14 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
       sync_mode, checkpoint_block_id));
 
   rpc_service_->RegisterRPC(distributed::kRequestSend,
-                            request_send_handler_.get());
+                            request_send_handler_.get(),
+                            FLAGS_rpc_send_thread_num);
   rpc_service_->RegisterRPC(distributed::kRequestGet,
-                            request_get_handler_.get());
+                            request_get_handler_.get(),
+                            FLAGS_rpc_get_thread_num);
   rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
-                            request_prefetch_handler_.get());
+                            request_prefetch_handler_.get(),
+                            FLAGS_rpc_prefetch_thread_num);
   rpc_service_->RegisterRPC(distributed::kRequestCheckpoint,
                             request_checkpoint_handler_.get());
 
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 8eab83fcd247fcd099ae1fa5dab1e67c2081bf9c..e72337a3e6f7884c3a05372e8732647e5910f3e4 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/port.h"
 
@@ -79,7 +79,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor<void> {
 template <typename DeviceContext>
 template <typename T>
 void LoDTensorToArrayFunctorImpl<DeviceContext>::apply() {
-  math::ConcatGradFunctor<DeviceContext, T> func;
+  math::SplitFunctor<DeviceContext, T> func;
   func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0,
        &prev_functor_->outputs_);
 }
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index b9ac54e446811889b647397ae1fbb11c28f46777..3226a727b1f5f6de9e97ce2068381be7c9b69ff3 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                      "Otherwise the given value indicates padding the output "
                      "with zeros whenever lookup encounters it in Ids.")
         .SetDefault(kNoPadding);
+    // NOTE(minqiyang): grad_inplace is an temporal attribute,
+    // please do NOT set this attribute in python layer.
+    AddAttr<bool>("grad_inplace",
+                  "(boolean, default false) "
+                  "If the grad op reuse the input's variable.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Lookup Table Operator.
 
@@ -115,7 +121,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Out"));
     return framework::OpKernelType(data_type, ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 58463dc4d6fd7cc3454de766814a947fee161070..e504c4f0cd5c0feaef4a251fad57b389a10a2ce7 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
       const auto *table = table_t.value().data<T>();
       auto *output = output_t->mutable_data<T>(context.GetPlace());
 
+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
       for (int64_t i = 0; i < ids_numel; ++i) {
         if (padding_idx != kNoPadding && ids[i] == padding_idx) {
           memset(output + i * row_width, 0, row_width * sizeof(T));
@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
           PADDLE_ENFORCE_GE(ids[i], 0);
           auto id_index = table_t.Index(ids[i]);
           PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
-          memcpy(output + i * row_width, table + id_index * row_width,
-                 row_width * sizeof(T));
+          blas.VCOPY(row_width, table + id_index * row_width,
+                     output + i * row_width);
         }
       }
     }
@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *ids_data = ids->data<int64_t>();
       int64_t ids_num = ids->numel();
 
-      framework::Vector<int64_t> new_rows;
-      new_rows.reserve(ids_num);
-      for (int64_t i = 0; i < ids_num; i++) {
-        new_rows.push_back(ids_data[i]);
-      }
+      std::vector<int64_t> new_rows;
+      new_rows.resize(ids_num);
+      std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
       d_table->set_rows(new_rows);
 
       auto *d_table_value = d_table->mutable_value();
       d_table_value->Resize({ids_num, table_dim[1]});
-      d_table_value->mutable_data<T>(context.GetPlace());
-
-      d_table->set_height(table_dim[0]);
-
-      auto *d_output_data = d_output->data<T>();
-      auto *d_table_data = d_table_value->data<T>();
-
-      auto d_output_dims = d_output->dims();
-      PADDLE_ENFORCE_EQ(
-          d_table_value->dims(),
-          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
-      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+      // FIXME(minqiyang):
+      // memory optimization will NOT reuse Tensor with SelectedRows
+      // so we could just share the tensor here directly.
+      // However, the InferVarType method will infer the output SelectedRows
+      // to Tensor sometimes, which is a bug, so we will add an attribute
+      // here to indicate the inplace and remove this attribute after
+      // the InferVarType's bug was fixed
+      bool grad_inplace = context.Attr<bool>("grad_inplace");
+      if (grad_inplace) {
+        d_table_value->ShareDataWith(*d_output);
+      } else {
+        d_table_value->mutable_data<T>(context.GetPlace());
+
+        d_table->set_height(table_dim[0]);
+
+        auto *d_output_data = d_output->data<T>();
+        auto *d_table_data = d_table_value->data<T>();
+
+        auto d_output_dims = d_output->dims();
+        PADDLE_ENFORCE_EQ(
+            d_table_value->dims(),
+            framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
+        memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+      }
     } else {
       auto *ids = context.Input<LoDTensor>("Ids");
       auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index b0276f4080b3989047f9d181e9d37a18dcfad5fa..17b675fba8067851f6149edafcc9096690a3fd34 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,5 +1,5 @@
 if (NOT WIN32)
-add_subdirectory(detail)
+    add_subdirectory(detail)
 endif(NOT WIN32)
 
 function(math_library TARGET)
@@ -35,7 +35,7 @@ function(math_library TARGET)
 endfunction()
 
 # please add new math_library in alphabetical order
-math_library(concat)
+math_library(concat_and_split)
 math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
@@ -43,10 +43,8 @@ math_library(depthwise_conv)
 math_library(im2col)
 
 if (NOT WIN32) # windows do not support avx functions yet.
-math_library(gru_compute DEPS activation_functions math_function)
-math_library(lstm_compute DEPS activation_functions)
-# TODO(TJ): ugly workaround, clean me
-cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info)
+    math_library(gru_compute DEPS activation_functions math_function)
+    math_library(lstm_compute DEPS activation_functions)
 endif (NOT WIN32)
 
 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
@@ -60,7 +58,7 @@ math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
 if (NOT WIN32)
-math_library(matrix_bit_code)
+    math_library(matrix_bit_code)
 endif (NOT WIN32)
 math_library(unpooling)
 math_library(vol2col)
@@ -70,9 +68,14 @@ cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selec
 cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
 cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
+cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
 if(WITH_GPU)
     nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
     nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
 endif()
-cc_test(concat_test SRCS concat_test.cc DEPS concat)
+cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
+cc_library(jit_kernel 
+    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
+    DEPS cpu_info cblas)
+cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e75b6abce5e1f43742ee15bff1dac4801186cd4
--- /dev/null
+++ b/paddle/fluid/operators/math/algorithm.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>  // for int64_t
+#include <numeric>
+
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
+  int64_t beg = 0, end = num - 1;
+  while (beg <= end) {
+    auto mid = ((beg + end) >> 1);
+    if (x[mid] == val)
+      return mid;
+    else if (x[mid] < val)
+      beg = mid + 1;
+    else
+      end = mid - 1;
+  }
+  return -1;
+}
+
+template <typename T>
+HOSTDEVICE inline size_t LowerBound(const T *x, size_t num, const T &val) {
+#ifdef __CUDA_ARCH__
+  // The following code is from
+  // https://en.cppreference.com/w/cpp/algorithm/lower_bound
+  auto *first = x;
+  int64_t count = static_cast<int64_t>(num);
+  while (count > 0) {
+    int64_t step = (count >> 1);
+    auto *it = first + step;
+    if (*it < val) {
+      first = ++it;
+      count -= (step + 1);
+    } else {
+      count = step;
+    }
+  }
+  return static_cast<size_t>(first - x);
+#else
+  return static_cast<size_t>(std::lower_bound(x, x + num, val) - x);
+#endif
+}
+
+template <typename T>
+HOSTDEVICE inline size_t UpperBound(const T *x, size_t num, const T &val) {
+#ifdef __CUDA_ARCH__
+  // The following code is from
+  // https://en.cppreference.com/w/cpp/algorithm/upper_bound
+  auto *first = x;
+  int64_t count = static_cast<int64_t>(num);
+  while (count > 0) {
+    auto step = (count >> 1);
+    auto *it = first + step;
+    if (val < *it) {
+      count = step;
+    } else {
+      first = ++it;
+      count -= (step + 1);
+    }
+  }
+  return static_cast<size_t>(first - x);
+#else
+  return static_cast<size_t>(std::upper_bound(x, x + num, val) - x);
+#endif
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat_and_split.cc
similarity index 95%
rename from paddle/fluid/operators/math/concat.cc
rename to paddle/fluid/operators/math/concat_and_split.cc
index 7b79f10e33d4474e279c6e46208722d6b52277fc..c6e17fd042f19bbeee3507e4cd64f49cff369682 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat_and_split.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include <vector>
 
 namespace paddle {
@@ -67,7 +67,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
  * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
-class ConcatGradFunctor<platform::CPUDeviceContext, T> {
+class SplitFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
@@ -111,7 +111,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
 };
 #define DEFINE_FUNCTOR(type)                                      \
   template class ConcatFunctor<platform::CPUDeviceContext, type>; \
-  template class ConcatGradFunctor<platform::CPUDeviceContext, type>;
+  template class SplitFunctor<platform::CPUDeviceContext, type>;
 
 FOR_ALL_TYPES(DEFINE_FUNCTOR);
 
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat_and_split.cu
similarity index 90%
rename from paddle/fluid/operators/math/concat.cu
rename to paddle/fluid/operators/math/concat_and_split.cu
index b59d86e661aff25eba8e770247e85845365d628b..760a065c1081d1e55901774b258ba524471b856b 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -24,7 +24,7 @@ namespace operators {
 namespace math {
 
 template <typename T>
-__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
+__global__ void ConcatKernel(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -50,7 +50,7 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
 }
 
 template <typename T>
-__global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
+__global__ void ConcatKernel(T** inputs_data, const int fixed_in_col,
                              const int out_rows, const int out_cols,
                              T* output_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -67,9 +67,9 @@ __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input_data, const int in_row,
-                                 const int in_col, const int* out_cols,
-                                 int out_cols_size, T** outputs_data) {
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int* out_cols,
+                            int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   int curr_segment = 0;
   int curr_offset = out_cols[0];
@@ -94,9 +94,9 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input_data, const int in_row,
-                                 const int in_col, const int fixed_out_col,
-                                 T** outputs_data) {
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int fixed_out_col,
+                            T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
     int split = tid_x / fixed_out_col;
@@ -170,11 +170,11 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
-      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
           dev_ins_data, in_col, out_row, out_col, output->data<T>());
     } else {
       const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
-      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
           dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
           out_row, out_col, output->data<T>());
     }
@@ -189,7 +189,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
  * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
-class ConcatGradFunctor<platform::CUDADeviceContext, T> {
+class SplitFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
@@ -248,11 +248,11 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
-      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
           input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
       const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
-      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
           input.data<T>(), in_row, in_col, dev_outs_col_data,
           static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
     }
@@ -264,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
 #define DEFINE_FUNCTOR(type)                                       \
   template class ConcatFunctor<platform::CUDADeviceContext, type>; \
-  template class ConcatGradFunctor<platform::CUDADeviceContext, type>
+  template class SplitFunctor<platform::CUDADeviceContext, type>
 
 FOR_ALL_TYPES(DEFINE_FUNCTOR);
 
diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat_and_split.h
similarity index 98%
rename from paddle/fluid/operators/math/concat.h
rename to paddle/fluid/operators/math/concat_and_split.h
index 867a84fa873a2e90bdab7a5eecbb1755cb4b02d1..3a5eddcbf4af699a89ae1a21571337155699a1f3 100644
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat_and_split.h
@@ -54,7 +54,7 @@ class ConcatFunctor {
  *     Output[1] = [[5,6]]
  */
 template <typename DeviceContext, typename T>
-class ConcatGradFunctor {
+class SplitFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const std::vector<const framework::Tensor*>& ref_inputs,
diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
index a46f2d51ca64501a622b5b48b424dffa16efc5b4..8ba9e8e8ec1344edc3beaf7f4a58f99107cc0e9c 100644
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/math/concat.h"
 #include <gtest/gtest.h>
 #include <vector>
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 
 template <typename DeviceContext, typename Place>
 void testConcat() {
diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc
deleted file mode 100644
index e96d1879331974e0873e13f171414bcfa8c45953..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cpu_lstm_compute.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-#ifdef __AVX__
-template <>
-void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
-                              float* ht) {
-  namespace act = detail::forward::avx;
-  // gates: W_ch, W_ih, W_fh, W_oh
-  __m256 c, i, f, o;
-  c = _mm256_loadu_ps(gates);
-  i = _mm256_loadu_ps(gates + 8);
-  f = _mm256_loadu_ps(gates + 16);
-  o = _mm256_loadu_ps(gates + 24);
-
-  /* C_t = C_t-1 * fgated + cand_gated * igated*/
-  c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
-  i = _mm256_loadu_ps(ct_1);
-  f = _mm256_mul_ps(i, act::Sigmoid(f));
-  f = _mm256_add_ps(c, f);
-  _mm256_storeu_ps(ct, f);
-
-  /* H_t = act_cell(C_t) * ogated */
-  o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
-  _mm256_storeu_ps(ht, o);
-}
-#endif
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h
deleted file mode 100644
index 169a9e4b47f54851ad436428416eca879b78e186..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/cpu_lstm_compute.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// TODO(TJ): ugly workaround, clean me
-template <typename T>
-void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
-  // gates: W_ch, W_ih, W_fh, W_oh
-  vec_sigmoid<T, platform::jit::avx>(24, gates + 8, gates + 8);
-  vec_tanh<T, platform::jit::avx>(8, gates, gates);
-  const T *i = gates + 8, *f = gates + 16, *o = gates + 24;
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int d = 0; d < 8; ++d) {
-    // C_t = C_t-1 * fgated + cand_gated * igated
-    ct[d] = ct_1[d] * f[d] + gates[d] * i[d];
-    // H_t = act_cell(C_t) * ogated
-    T tmp = ct[d] * 2;
-    tmp = static_cast<T>(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
-    vec_exp<T>(1, &tmp, &tmp);
-    tmp = static_cast<T>(2) / (static_cast<T>(1) + tmp) - static_cast<T>(1);
-    ht[d] = tmp * o[d];
-  }
-}
-
-#ifdef __AVX__
-namespace detail {
-namespace forward {
-namespace avx {
-__m256 Sigmoid(const __m256 a);
-__m256 Tanh(const __m256 a);
-
-}  // namespace avx
-}  // namespace forward
-}  // namespace detail
-
-template <>
-void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
-                              float* ht);
-
-#endif
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 6a059968b79189458349e466079cc7a663a8e5ff..0aed253c80fc28560716cbcfa70f74ef9c84f9b6 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -125,10 +125,8 @@ inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
 }
 
 template <>
-inline void vec_scal<float, platform::jit::avx512_common>(const int n,
-                                                          const float a,
-                                                          const float* x,
-                                                          float* y) {
+inline void vec_scal<float, platform::jit::avx512f>(const int n, const float a,
+                                                    const float* x, float* y) {
   // TODO(TJ): enable me
   vec_scal<float, platform::jit::avx2>(n, a, x, y);
 }
@@ -181,10 +179,10 @@ inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
 }
 
 template <>
-inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
-                                                              const float a,
-                                                              const float* x,
-                                                              float* y) {
+inline void vec_bias_sub<float, platform::jit::avx512f>(const int n,
+                                                        const float a,
+                                                        const float* x,
+                                                        float* y) {
   // TODO(TJ): enable me
   vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
 }
@@ -242,7 +240,7 @@ inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
 }
 
 template <>
-inline void vec_cross<float, platform::jit::avx512_common>(
+inline void vec_cross<float, platform::jit::avx512f>(
     const int n, const float* x, const float* y, const float* z, float* out) {
   // TODO(TJ): enable me
   vec_cross<float, platform::jit::avx>(n, x, y, z, out);
@@ -296,10 +294,10 @@ inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
 }
 
 template <>
-inline void vec_add_bias<float, platform::jit::avx512_common>(const int n,
-                                                              const float a,
-                                                              const float* x,
-                                                              float* y) {
+inline void vec_add_bias<float, platform::jit::avx512f>(const int n,
+                                                        const float a,
+                                                        const float* x,
+                                                        float* y) {
   // TODO(TJ): enable me
   vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
 }
@@ -390,9 +388,9 @@ inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
 }
 
 template <>
-inline void vec_sigmoid<float, platform::jit::avx512_common>(const int n,
-                                                             const float* x,
-                                                             float* y) {
+inline void vec_sigmoid<float, platform::jit::avx512f>(const int n,
+                                                       const float* x,
+                                                       float* y) {
   // TODO(TJ): enable me
   vec_sigmoid<float, platform::jit::avx2>(n, x, y);
 }
@@ -454,9 +452,8 @@ inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
 }
 
 template <>
-inline void vec_relu<float, platform::jit::avx512_common>(const int n,
-                                                          const float* x,
-                                                          float* y) {
+inline void vec_relu<float, platform::jit::avx512f>(const int n, const float* x,
+                                                    float* y) {
   // TODO(TJ): enable me
   vec_relu<float, platform::jit::avx2>(n, x, y);
 }
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index 3ce66f49ed8354c49e8af26ca6eb48fef654a40b..cd40f1b2f984126663a5711efac24fdf6d680b32 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) {
     TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
     TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
     TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512_common>,
+    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512f>,
                         ref_sigmoid<float>);
   }
   TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
@@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) {
     TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
     TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
     TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, jit::avx512_common>,
-                        ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
   }
   TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
 }
@@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) {
     TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
     TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
     TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, jit::avx512_common>,
-                        ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
   }
   TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
 }
@@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) {
     TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
     TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
     TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, jit::avx512_common>,
+    TestInplace<float>(sz, vec_sigmoid<float, jit::avx512f>,
                        ref_sigmoid<float>);
   }
   TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
@@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) {
     TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
     TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
     TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, jit::avx512_common>,
-                       ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
   }
   TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
 }
@@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) {
     TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
     TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
     TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, jit::avx512_common>,
-                       ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
   }
   TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
 }
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
index 1f5a49c0ab5a10b0d7dc1febd258ce76c467cb1c..87220d4019fc9337fb8355172ca4f1372cfd4558 100644
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/jit_kernel.h"
 
 DECLARE_int32(paddle_num_threads);
 
@@ -30,20 +31,25 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
   if (B == NULL) {
     return;
   }
+  if (relu) {
+    const auto& vaddrelu = jitkernel::KernelPool::Instance()
+                               .template Get<jitkernel::VAddReluKernel<T>>(N);
+    for (int i = 0; i < M; i++) {
+      T* dst = Y + i * N;
+      vaddrelu->Compute(B, dst, dst);
+    }
+  } else {
+    const auto& vadd = jitkernel::KernelPool::Instance()
+                           .template Get<jitkernel::VAddKernel<T>>(N);
 
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
 #endif
-  for (int i = 0; i < M; i++) {
-    blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
+    for (int i = 0; i < M; i++) {
+      T* dst = Y + i * N;
+      vadd->Compute(B, dst, dst);
+    }
   }
-
-  if (!relu) {
-    return;
-  }
-
-  // TODO(TJ): fuse relu
-  LOG(FATAL) << "Not implemented!";
 }
 
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..68b708b345334bc63b5e2e88c308d20ca6378e6b
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel.cc
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <iostream>
+#include <string>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+KernelPool& KernelPool::Instance() {
+  static thread_local KernelPool g_jit_kernels;
+  return g_jit_kernels;
+}
+
+std::shared_ptr<const Kernel> KernelPool::Get(const std::string& key) const {
+  if (kers_.find(key) == kers_.end()) {
+    return nullptr;
+  }
+  return kers_.at(key);
+}
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..48e180b1fd43b06cc13f7a4b00c73aff2eb940ac
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -0,0 +1,164 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <memory>  // for shared_ptr
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/macros.h"
+
+// Note: Only support on CPU yet.
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+#define AVX_FLOAT_BLOCK 8
+#define AVX2_FLOAT_BLOCK 8
+#define AVX512_FLOAT_BLOCK 16
+
+typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;
+
+class Kernel {
+ public:
+  Kernel() = default;
+  virtual ~Kernel() = default;
+  int num_{0};
+  int end_{0};
+  int rest_{0};
+  DISABLE_COPY_AND_ASSIGN(Kernel);
+};
+
+class KernelPool {
+ public:
+  static KernelPool &Instance();
+
+  template <typename Ker, typename... ARGS>
+  std::shared_ptr<const Ker> Get(ARGS... args);
+
+  std::shared_ptr<const Kernel> Get(const std::string &key) const;
+
+ private:
+  KernelPool() = default;
+  std::unordered_map<std::string, std::shared_ptr<const Kernel>> kers_;
+
+  DISABLE_COPY_AND_ASSIGN(KernelPool);
+};
+
+template <typename T>
+class VMulKernel : public Kernel {
+ public:
+  virtual void Compute(const T *x, const T *y, T *z) const = 0;
+};
+
+template <typename T>
+class VAddKernel : public Kernel {
+ public:
+  virtual void Compute(const T *x, const T *y, T *z) const = 0;
+};
+
+template <typename T>
+class VScalKernel : public Kernel {
+ public:
+  virtual void Compute(const T a, const T *x, T *y) const = 0;
+  virtual void Compute(const T a, T *x) const = 0;
+};
+
+template <typename T>
+class VAddBiasKernel : public Kernel {
+ public:
+  virtual void Compute(const T a, const T *x, T *y) const = 0;
+};
+
+template <typename T>
+class VAddReluKernel : public Kernel {
+ public:
+  virtual void Compute(const T *x, const T *y, T *z) const = 0;
+};
+
+template <typename T>
+class VActKernel : public Kernel {
+ public:
+  virtual void Compute(const T *x, T *y) const = 0;
+};
+
+template <typename T>
+class VReluKernel : public VActKernel<T> {
+ public:
+  virtual void Compute(const T *x, T *y) const = 0;
+};
+
+template <typename T>
+class VIdentityKernel : public VActKernel<T> {
+ public:
+  virtual void Compute(const T *x, T *y) const = 0;
+};
+
+template <typename T>
+class VExpKernel : public VActKernel<T> {
+ public:
+  virtual void Compute(const T *x, T *y) const = 0;
+};
+
+template <typename T>
+class VSigmoidKernel : public VActKernel<T> {
+ public:
+  virtual void Compute(const T *x, T *y) const = 0;
+};
+
+template <typename T>
+class VTanhKernel : public VActKernel<T> {
+ public:
+  virtual void Compute(const T *x, T *y) const = 0;
+};
+
+template <typename T>
+class LSTMKernel : public Kernel {
+ public:
+  virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht,
+                           /* below only used in peephole*/
+                           const T *wp_data = nullptr,
+                           T *checked = nullptr) const = 0;
+
+  // compute c1 and h1 without c0 or h0
+  virtual void ComputeC1H1(T *gates, T *ct, T *ht,
+                           /* below only used in peephole*/
+                           const T *wp_data = nullptr) const = 0;
+};
+
+template <typename T>
+class GRUKernel : public Kernel {
+ public:
+  // compute h1 without h0
+  virtual void ComputeH1(T *gates, T *ht) const = 0;
+  virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0;
+  virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0;
+};
+
+template <typename T>
+class CRFDecodeKernel : public Kernel {
+ public:
+  virtual void Compute(const int seq_len, const T *x, const T *w, T *alpha,
+                       int *track) const = 0;
+};
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c88b17b012d1b9cd59220f6b37cb2ecb8a1551a4
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -0,0 +1,479 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <string>
+#include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+/* VMUL JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VMulKernelImpl : public VMulKernel<T> {
+ public:
+  explicit VMulKernelImpl(int d) : VMulKernel<T>() { this->num_ = d; }
+  void Compute(const T* x, const T* y, T* z) const override {
+    for (int i = 0; i < this->num_; ++i) {
+      z[i] = x[i] * y[i];
+    }
+  }
+};
+
+#ifdef PADDLE_WITH_MKLML
+#define MKL_FLOAT(isa, block)                           \
+  template <>                                           \
+  void VMulKernelImpl<float, isa, block>::Compute(      \
+      const float* x, const float* y, float* z) const { \
+    platform::dynload::vsMul(this->num_, x, y, z);      \
+  }
+
+#define MKL_DOUBLE(isa, block)                             \
+  template <>                                              \
+  void VMulKernelImpl<double, isa, block>::Compute(        \
+      const double* x, const double* y, double* z) const { \
+    platform::dynload::vdMul(this->num_, x, y, z);         \
+  }
+
+FOR_EACH_ISA(MKL_FLOAT, kGT16);
+FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
+#endif
+
+#define INTRI8_FLOAT(isa)                               \
+  template <>                                           \
+  void VMulKernelImpl<float, isa, kEQ8>::Compute(       \
+      const float* x, const float* y, float* z) const { \
+    __m256 tmpx, tmpy;                                  \
+    tmpx = _mm256_loadu_ps(x);                          \
+    tmpy = _mm256_loadu_ps(y);                          \
+    tmpx = _mm256_mul_ps(tmpx, tmpy);                   \
+    _mm256_storeu_ps(z, tmpx);                          \
+  }
+
+// avx > for > mkl
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+#endif
+// TODO(TJ): eq16 test and complete avx512
+#undef INTRI8_FLOAT
+#undef MKL_FLOAT
+#undef MKL_DOUBLE
+
+/* VADD JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VAddKernelImpl : public VAddKernel<T> {
+ public:
+  explicit VAddKernelImpl(int d) : VAddKernel<T>() { this->num_ = d; }
+  void Compute(const T* x, const T* y, T* z) const override {
+    for (int i = 0; i < this->num_; ++i) {
+      z[i] = x[i] + y[i];
+    }
+  }
+};
+
+#ifdef PADDLE_WITH_MKLML
+#define MKL_FLOAT(isa, block)                           \
+  template <>                                           \
+  void VAddKernelImpl<float, isa, block>::Compute(      \
+      const float* x, const float* y, float* z) const { \
+    platform::dynload::vsAdd(this->num_, x, y, z);      \
+  }
+
+#define MKL_DOUBLE(isa, block)                             \
+  template <>                                              \
+  void VAddKernelImpl<double, isa, block>::Compute(        \
+      const double* x, const double* y, double* z) const { \
+    platform::dynload::vdAdd(this->num_, x, y, z);         \
+  }
+
+FOR_EACH_ISA(MKL_FLOAT, kGT16);
+FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
+#endif
+
+#define INTRI8_FLOAT(isa)                               \
+  template <>                                           \
+  void VAddKernelImpl<float, isa, kEQ8>::Compute(       \
+      const float* x, const float* y, float* z) const { \
+    __m256 tmpx, tmpy;                                  \
+    tmpx = _mm256_loadu_ps(x);                          \
+    tmpy = _mm256_loadu_ps(y);                          \
+    tmpx = _mm256_add_ps(tmpx, tmpy);                   \
+    _mm256_storeu_ps(z, tmpx);                          \
+  }
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+#endif
+// TODO(TJ): eq16 test and complete avx512
+
+#undef INTRI8_FLOAT
+#undef MKL_FLOAT
+#undef MKL_DOUBLE
+
+/* VSCAL JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VScalKernelImpl : public VScalKernel<T> {
+ public:
+  explicit VScalKernelImpl(int d) : VScalKernel<T>() { this->num_ = d; }
+  void Compute(const T a, const T* x, T* y) const override {
+    for (int i = 0; i < this->num_; ++i) {
+      y[i] = a * x[i];
+    }
+  }
+  void Compute(const T a, T* x) const override {
+    for (int i = 0; i < this->num_; ++i) {
+      x[i] = a * x[i];
+    }
+  }
+};
+
+#ifdef PADDLE_WITH_MKLML
+#define MKL_FLOAT(isa, block)                                               \
+  template <>                                                               \
+  void VScalKernelImpl<float, isa, block>::Compute(const float a, float* x) \
+      const {                                                               \
+    platform::dynload::cblas_sscal(this->num_, a, x, 1);                    \
+  }
+
+#define MKL_DOUBLE(isa, block)                                                 \
+  template <>                                                                  \
+  void VScalKernelImpl<double, isa, block>::Compute(const double a, double* x) \
+      const {                                                                  \
+    platform::dynload::cblas_dscal(this->num_, a, x, 1);                       \
+  }
+
+FOR_EACH_ISA(MKL_FLOAT, kGT16);
+FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
+#endif
+
+#define INTRI8_FLOAT(isa)                              \
+  template <>                                          \
+  void VScalKernelImpl<float, isa, kEQ8>::Compute(     \
+      const float a, const float* x, float* y) const { \
+    __m256 tmp;                                        \
+    __m256 scalar = _mm256_set1_ps(a);                 \
+    tmp = _mm256_loadu_ps(x);                          \
+    tmp = _mm256_mul_ps(tmp, scalar);                  \
+    _mm256_storeu_ps(y, tmp);                          \
+  }
+#define INTRI8_INPLACE_FLOAT(isa)                                          \
+  template <>                                                              \
+  void VScalKernelImpl<float, isa, kEQ8>::Compute(const float a, float* x) \
+      const {                                                              \
+    __m256 tmp;                                                            \
+    __m256 scalar = _mm256_set1_ps(a);                                     \
+    tmp = _mm256_loadu_ps(x);                                              \
+    tmp = _mm256_mul_ps(tmp, scalar);                                      \
+    _mm256_storeu_ps(x, tmp);                                              \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+INTRI8_INPLACE_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+INTRI8_INPLACE_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+INTRI8_INPLACE_FLOAT(jit::avx512f);
+#endif
+// TODO(TJ): eq16 test and complete avx512
+
+#undef INTRI8_FLOAT
+#undef INTRI8_INPLACE_FLOAT
+#undef MKL_FLOAT
+#undef MKL_DOUBLE
+
+/* VAddBias JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VAddBiasKernelImpl : public VAddBiasKernel<T> {
+ public:
+  explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() { this->num_ = d; }
+  void Compute(const T a, const T* x, T* y) const override {
+    for (int i = 0; i < this->num_; ++i) {
+      y[i] = x[i] + a;
+    }
+  }
+};
+
+#define INTRI8_FLOAT(isa)                              \
+  template <>                                          \
+  void VAddBiasKernelImpl<float, isa, kEQ8>::Compute(  \
+      const float a, const float* x, float* y) const { \
+    __m256 tmp = _mm256_loadu_ps(x);                   \
+    tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a));       \
+    _mm256_storeu_ps(y, tmp);                          \
+  }
+
+#define INTRI16_FLOAT(isa)                             \
+  template <>                                          \
+  void VAddBiasKernelImpl<float, isa, kEQ16>::Compute( \
+      const float a, const float* x, float* y) const { \
+    __m256 tmp0 = _mm256_loadu_ps(x);                  \
+    __m256 tmp1 = _mm256_loadu_ps(x + 8);              \
+    tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a));     \
+    tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a));     \
+    _mm256_storeu_ps(y, tmp0);                         \
+    _mm256_storeu_ps(y + 8, tmp1);                     \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+INTRI16_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+INTRI16_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+INTRI16_FLOAT(jit::avx512f);
+#endif
+// TODO(TJ): eq16 test and complete avx512
+
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+
+/* VRelu JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VReluKernelImpl : public VReluKernel<T> {
+ public:
+  explicit VReluKernelImpl(int d) : VReluKernel<T>() { this->num_ = d; }
+  void Compute(const T* x, T* y) const override {
+    for (int i = 0; i < this->num_; ++i) {
+      y[i] = x[i] > 0 ? x[i] : 0;
+    }
+  }
+};
+
+#define INTRI8_FLOAT(isa)                                                   \
+  template <>                                                               \
+  void VReluKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
+      const {                                                               \
+    __m256 tmp = _mm256_loadu_ps(x);                                        \
+    tmp = _mm256_max_ps(tmp, _mm256_setzero_ps());                          \
+    _mm256_storeu_ps(y, tmp);                                               \
+  }
+
+#define INTRI16_FLOAT(isa)                                                   \
+  template <>                                                                \
+  void VReluKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
+      const {                                                                \
+    __m256 zeros = _mm256_setzero_ps();                                      \
+    __m256 tmp0 = _mm256_loadu_ps(x);                                        \
+    __m256 tmp1 = _mm256_loadu_ps(x + 8);                                    \
+    tmp0 = _mm256_max_ps(tmp0, zeros);                                       \
+    tmp1 = _mm256_max_ps(tmp1, zeros);                                       \
+    _mm256_storeu_ps(y, tmp0);                                               \
+    _mm256_storeu_ps(y + 8, tmp1);                                           \
+  }
+
+#define INTRI_GT8LT16_FLOAT(isa)                                        \
+  template <>                                                           \
+  VReluKernelImpl<float, isa, kGT8LT16>::VReluKernelImpl(int d)         \
+      : VReluKernel<float>() {                                          \
+    this->num_ = d;                                                     \
+    this->end_ = AVX_FLOAT_BLOCK;                                       \
+    this->rest_ = d - AVX_FLOAT_BLOCK;                                  \
+  }                                                                     \
+  template <>                                                           \
+  void VReluKernelImpl<float, isa, kGT8LT16>::Compute(const float* x,   \
+                                                      float* y) const { \
+    __m256 zeros = _mm256_setzero_ps();                                 \
+    __m256 tmp0 = _mm256_loadu_ps(x);                                   \
+    __m256 tmp1 = _mm256_loadu_ps(x + this->rest_);                     \
+    tmp0 = _mm256_max_ps(tmp0, zeros);                                  \
+    tmp1 = _mm256_max_ps(tmp1, zeros);                                  \
+    _mm256_storeu_ps(y, tmp0);                                          \
+    _mm256_storeu_ps(y + this->rest_, tmp1);                            \
+  }
+
+#define INTRI_GT16_FLOAT(isa)                                                \
+  template <>                                                                \
+  VReluKernelImpl<float, isa, kGT16>::VReluKernelImpl(int d)                 \
+      : VReluKernel<float>() {                                               \
+    this->num_ = d;                                                          \
+    this->end_ = d - d % AVX_FLOAT_BLOCK;                                    \
+    this->rest_ = d - AVX_FLOAT_BLOCK;                                       \
+  }                                                                          \
+  template <>                                                                \
+  void VReluKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
+      const {                                                                \
+    __m256 zeros = _mm256_setzero_ps();                                      \
+    for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) {                  \
+      __m256 tmp = _mm256_loadu_ps(x + i);                                   \
+      tmp = _mm256_max_ps(tmp, zeros);                                       \
+      _mm256_storeu_ps(y + i, tmp);                                          \
+    }                                                                        \
+    __m256 tmp = _mm256_loadu_ps(x + this->rest_);                           \
+    tmp = _mm256_max_ps(tmp, zeros);                                         \
+    _mm256_storeu_ps(y + this->rest_, tmp);                                  \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+INTRI16_FLOAT(jit::avx);
+INTRI_GT8LT16_FLOAT(jit::avx);
+INTRI_GT16_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+INTRI16_FLOAT(jit::avx2);
+INTRI_GT8LT16_FLOAT(jit::avx2);
+INTRI_GT16_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+// TODO(TJ): refine avx512
+INTRI8_FLOAT(jit::avx512f);
+INTRI16_FLOAT(jit::avx512f);
+INTRI_GT8LT16_FLOAT(jit::avx512f);
+INTRI_GT16_FLOAT(jit::avx512f);
+#endif
+
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+#undef INTRI_GT8LT16_FLOAT
+#undef INTRI_GT16_FLOAT
+
+/* An empty JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VIdentityKernelImpl : public VIdentityKernel<T> {
+ public:
+  explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() { this->num_ = d; }
+  void Compute(const T* x, T* y) const override {}
+};
+
+/* VAddRelu JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VAddReluKernelImpl : public VAddReluKernel<T> {
+ public:
+  explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() { this->num_ = d; }
+  void Compute(const T* x, const T* y, T* z) const override {
+    for (int i = 0; i < this->num_; ++i) {
+      z[i] = x[i] + y[i];
+      z[i] = z[i] > 0 ? z[i] : 0;
+    }
+  }
+};
+
+#define INTRI8_FLOAT(isa)                               \
+  template <>                                           \
+  void VAddReluKernelImpl<float, isa, kEQ8>::Compute(   \
+      const float* x, const float* y, float* z) const { \
+    __m256 tmpx = _mm256_loadu_ps(x);                   \
+    __m256 tmpy = _mm256_loadu_ps(y);                   \
+    tmpy = _mm256_add_ps(tmpx, tmpy);                   \
+    tmpy = _mm256_max_ps(tmpy, _mm256_setzero_ps());    \
+    _mm256_storeu_ps(z, tmpy);                          \
+  }
+
+#define INTRI16_FLOAT(isa)                              \
+  template <>                                           \
+  void VAddReluKernelImpl<float, isa, kEQ16>::Compute(  \
+      const float* x, const float* y, float* z) const { \
+    __m256 zeros = _mm256_setzero_ps();                 \
+    __m256 tmp0 = _mm256_loadu_ps(x);                   \
+    __m256 tmp1 = _mm256_loadu_ps(y);                   \
+    tmp0 = _mm256_add_ps(tmp0, tmp1);                   \
+    tmp0 = _mm256_max_ps(tmp0, zeros);                  \
+    tmp1 = _mm256_loadu_ps(x + 8);                      \
+    __m256 tmp2 = _mm256_loadu_ps(y + 8);               \
+    tmp1 = _mm256_add_ps(tmp1, tmp2);                   \
+    tmp1 = _mm256_max_ps(tmp1, zeros);                  \
+    _mm256_storeu_ps(z, tmp0);                          \
+    _mm256_storeu_ps(z + 8, tmp1);                      \
+  }
+
+#define INTRI_COMMON_FLOAT(isa, block)                             \
+  template <>                                                      \
+  VAddReluKernelImpl<float, isa, block>::VAddReluKernelImpl(int d) \
+      : VAddReluKernel<float>() {                                  \
+    this->num_ = d;                                                \
+    this->end_ = d - d % AVX_FLOAT_BLOCK;                          \
+    this->rest_ = d - this->end_;                                  \
+  }                                                                \
+  template <>                                                      \
+  void VAddReluKernelImpl<float, isa, block>::Compute(             \
+      const float* x, const float* y, float* z) const {            \
+    __m256 zeros = _mm256_setzero_ps();                            \
+    for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) {        \
+      __m256 tmpx = _mm256_loadu_ps(x + i);                        \
+      __m256 tmpy = _mm256_loadu_ps(y + i);                        \
+      tmpy = _mm256_add_ps(tmpx, tmpy);                            \
+      tmpy = _mm256_max_ps(tmpy, zeros);                           \
+      _mm256_storeu_ps(z + i, tmpy);                               \
+    }                                                              \
+    for (int i = this->end_; i < this->num_; ++i) {                \
+      z[i] = x[i] + y[i];                                          \
+      z[i] = z[i] > 0 ? z[i] : 0;                                  \
+    }                                                              \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+INTRI16_FLOAT(jit::avx);
+INTRI_COMMON_FLOAT(jit::avx, kGT16);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+INTRI16_FLOAT(jit::avx2);
+INTRI_COMMON_FLOAT(jit::avx2, kGT16);
+#endif
+#ifdef __AVX512F__
+// TODO(TJ): refine avx512
+INTRI8_FLOAT(jit::avx512f);
+INTRI16_FLOAT(jit::avx512f);
+INTRI_COMMON_FLOAT(jit::avx512f, kGT16);
+#endif
+
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+#undef INTRI_COMMON_FLOAT
+
+REGISTER_JITKERNEL(vmul, VMulKernel);
+REGISTER_JITKERNEL(vadd, VAddKernel);
+REGISTER_JITKERNEL(vscal, VScalKernel);
+REGISTER_JITKERNEL(vaddb, VAddBiasKernel);
+REGISTER_JITKERNEL(vrelu, VReluKernel);
+REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
+REGISTER_JITKERNEL(videntity, VIdentityKernel);
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e481d1921a7dc4fd6da3fffbc3959eafa7b4b461
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -0,0 +1,296 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <limits>
+#include <string>
+#include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+/* CRF Decode JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
+ public:
+  explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() {
+    this->num_ = tag_num;
+  }
+  void Compute(const int seq_len, const T* x, const T* w, T* alpha,
+               int* track) const override {
+    constexpr int state_trans_base_idx = 2;
+    for (int i = 0; i < this->num_; ++i) {
+      alpha[i] = w[i] + x[i];
+    }
+    for (int k = 1; k < seq_len; ++k) {
+      for (int i = 0; i < this->num_; ++i) {
+        T max_score = -std::numeric_limits<T>::max();
+        int max_j = 0;
+        for (int j = 0; j < this->num_; ++j) {
+          T score = alpha[(k - 1) * this->num_ + j] +
+                    w[(j + state_trans_base_idx) * this->num_ + i];
+          if (score > max_score) {
+            max_score = score;
+            max_j = j;
+          }
+        }
+        alpha[k * this->num_ + i] = max_score + x[k * this->num_ + i];
+        track[k * this->num_ + i] = max_j;
+      }
+    }
+  }
+};
+
+#define INIT_ALPHA(step_size)                                               \
+  /* Setup the alpha initial value.*/                                       \
+  int i_offset = 0;                                                         \
+  int last_offset = this->rest_ - step_size;                                \
+  for (int i = 0; i <= this->end_; ++i) {                                   \
+    /* weights, input and alpha values. */                                  \
+    __m256 w_content, x_content, alpha_content;                             \
+    /* Load the relevant data into the variables from un-aligned address.*/ \
+    w_content = _mm256_loadu_ps(w + i_offset);                              \
+    x_content = _mm256_loadu_ps(x + i_offset);                              \
+    alpha_content = _mm256_add_ps(w_content, x_content);                    \
+    _mm256_storeu_ps(alpha + i_offset, alpha_content);                      \
+    i_offset += step_size;                                                  \
+    if (i == this->end_ - 1) {                                              \
+      if (this->rest_ > 0) {                                                \
+        i_offset += last_offset;                                            \
+      } else {                                                              \
+        break;                                                              \
+      }                                                                     \
+    }                                                                       \
+  }
+
+#define UPDATE_ALPHA(step_size)                                               \
+  /* Update the alpha and track values. */                                    \
+  __m256 x_content = _mm256_loadu_ps(x + seq_offset + this->num_ + j_offset); \
+  max_score = _mm256_add_ps(max_score, x_content);                            \
+  _mm256_storeu_ps(alpha + seq_offset + this->num_ + j_offset, max_score);    \
+  _mm256_storeu_si256(                                                        \
+      reinterpret_cast<__m256i*>(track + seq_offset + this->num_ + j_offset), \
+      max_j);                                                                 \
+  /* Calculate the offset of next step*/                                      \
+  j_offset += step_size;                                                      \
+  if (j == this->end_ - 1) {                                                  \
+    if (this->rest_ > 0) {                                                    \
+      j_offset += last_offset;                                                \
+    } else {                                                                  \
+      break;                                                                  \
+    }                                                                         \
+  }
+
+#define INTRIAVX_FLOAT(block)                                                  \
+  template <>                                                                  \
+  CRFDecodeKernelImpl<float, jit::avx, block>::CRFDecodeKernelImpl(            \
+      int tag_num)                                                             \
+      : CRFDecodeKernel<float>() {                                             \
+    this->num_ = tag_num;                                                      \
+    this->end_ = this->num_ / AVX_FLOAT_BLOCK;                                 \
+    this->rest_ = this->num_ % AVX_FLOAT_BLOCK;                                \
+  }                                                                            \
+  template <>                                                                  \
+  void CRFDecodeKernelImpl<float, jit::avx, block>::Compute(                   \
+      const int seq_len, const float* x, const float* w, float* alpha,         \
+      int* track) const {                                                      \
+    INIT_ALPHA(AVX_FLOAT_BLOCK)                                                \
+    /* Use the column-major strategy to get the location of maximum score.*/   \
+    int seq_offset = 0;                                                        \
+    constexpr int state_trans_base_idx = 2;                                    \
+    for (int k = 1; k < seq_len; ++k) {                                        \
+      int j_offset = 0;                                                        \
+      for (int j = 0; j <= this->end_; ++j) {                                  \
+        /* Initialize the variables of maximum score and location.*/           \
+        __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); \
+        __m256i max_j = _mm256_set1_epi32(0);                                  \
+        /* Calculate the offset of transition_weights.*/                       \
+        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
+        for (int i = 0; i < this->num_; ++i) {                                 \
+          /* Initalize the content of alpha variable with related offset.*/    \
+          __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);  \
+          /* Obtain the content of weights from un-aligned address.*/          \
+          __m256 w_content = _mm256_loadu_ps(w + trans_offset);                \
+          __m256 score_v = _mm256_add_ps(alpha_content, w_content);            \
+          __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);         \
+          /* According to the mask value, update the index of the max_score.*/ \
+          /* AVX instructions.*/                                               \
+          __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);               \
+          __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);               \
+          __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);        \
+          __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);        \
+          lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);                      \
+          hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);                      \
+          lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));                 \
+          hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));                 \
+          lo_max_j = _mm_or_si128(lo_mask, lo_max_j);                          \
+          hi_max_j = _mm_or_si128(hi_mask, hi_max_j);                          \
+          max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);                 \
+          max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);                 \
+          /* AVX done*/                                                        \
+          /* Update the max_score value.*/                                     \
+          max_score = _mm256_max_ps(max_score, score_v);                       \
+          trans_offset += this->num_;                                          \
+        }                                                                      \
+        UPDATE_ALPHA(AVX_FLOAT_BLOCK)                                          \
+      }                                                                        \
+      seq_offset += this->num_;                                                \
+    }                                                                          \
+  }
+
+#define INTRIAVX2_FLOAT(isa, block)                                            \
+  template <>                                                                  \
+  CRFDecodeKernelImpl<float, isa, block>::CRFDecodeKernelImpl(int tag_num)     \
+      : CRFDecodeKernel<float>() {                                             \
+    this->num_ = tag_num;                                                      \
+    this->end_ = this->num_ / AVX2_FLOAT_BLOCK;                                \
+    this->rest_ = this->num_ % AVX2_FLOAT_BLOCK;                               \
+  }                                                                            \
+  template <>                                                                  \
+  void CRFDecodeKernelImpl<float, isa, block>::Compute(                        \
+      const int seq_len, const float* x, const float* w, float* alpha,         \
+      int* track) const {                                                      \
+    INIT_ALPHA(AVX2_FLOAT_BLOCK)                                               \
+    /* Use the column-major strategy to get the location of maximum score.*/   \
+    int seq_offset = 0;                                                        \
+    constexpr int state_trans_base_idx = 2;                                    \
+    for (int k = 1; k < seq_len; ++k) {                                        \
+      int j_offset = 0;                                                        \
+      for (int j = 0; j <= this->end_; ++j) {                                  \
+        /* Initialize the variables of maximum score and location.*/           \
+        __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); \
+        __m256i max_j = _mm256_set1_epi32(0);                                  \
+        /* Calculate the offset of transition_weights.*/                       \
+        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
+        for (int i = 0; i < this->num_; ++i) {                                 \
+          /* Initalize the content of alpha variable with related offset.*/    \
+          __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);  \
+          /* Obtain the content of weights from un-aligned address.*/          \
+          __m256 w_content = _mm256_loadu_ps(w + trans_offset);                \
+          __m256 score_v = _mm256_add_ps(alpha_content, w_content);            \
+          __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);         \
+          /* According to the mask value, update the index of the max_score.*/ \
+          /* AVX2 instructions.*/                                              \
+          max_j = _mm256_or_si256(                                             \
+              _mm256_andnot_si256((__m256i)mask, max_j),                       \
+              _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));          \
+          /* Update the max_score value.*/                                     \
+          max_score = _mm256_max_ps(max_score, score_v);                       \
+          trans_offset += this->num_;                                          \
+        }                                                                      \
+        UPDATE_ALPHA(AVX2_FLOAT_BLOCK)                                         \
+      }                                                                        \
+      seq_offset += this->num_;                                                \
+    }                                                                          \
+  }
+
+#define INTRIAVX512_FLOAT(block)                                               \
+  template <>                                                                  \
+  CRFDecodeKernelImpl<float, jit::avx512f, block>::CRFDecodeKernelImpl(        \
+      int tag_num)                                                             \
+      : CRFDecodeKernel<float>() {                                             \
+    this->num_ = tag_num;                                                      \
+    this->end_ = this->num_ / AVX512_FLOAT_BLOCK;                              \
+    this->rest_ = this->num_ % AVX512_FLOAT_BLOCK;                             \
+  }                                                                            \
+  template <>                                                                  \
+  void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute(               \
+      const int seq_len, const float* x, const float* w, float* alpha,         \
+      int* track) const {                                                      \
+    INIT_ALPHA(AVX512_FLOAT_BLOCK)                                             \
+    /* Use the column-major strategy to get the location of maximum score.*/   \
+    int seq_offset = 0;                                                        \
+    constexpr int state_trans_base_idx = 2;                                    \
+    for (int k = 1; k < seq_len; ++k) {                                        \
+      int j_offset = 0;                                                        \
+      for (int j = 0; j <= this->end_; ++j) {                                  \
+        /* Initialize the variables of maximum score and location.*/           \
+        __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max()); \
+        __m512i max_j = _mm512_setzero_si512();                                \
+        /* Calculate the offset of transition_weights.*/                       \
+        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
+        for (int i = 0; i < this->num_; ++i) {                                 \
+          /* Initalize the content of alpha variable with related offset.*/    \
+          __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i));    \
+          /* Obtain the content of weights from un-aligned address.*/          \
+          __m512 w_content = _mm512_loadu_ps(w + trans_offset);                \
+          __m512 score_v = _mm512_add_ps(alpha_content, w_content);            \
+          __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); \
+          /* AVX512 instructions.*/                                            \
+          max_j = _mm512_mask_set1_epi32(max_j, mask, i);                      \
+          /* Update the max_score value.*/                                     \
+          max_score = _mm512_max_ps(max_score, score_v);                       \
+          trans_offset += this->num_;                                          \
+        }                                                                      \
+        /* Update the alpha and track values.*/                                \
+        __m512 x_content =                                                     \
+            _mm512_loadu_ps(x + seq_offset + this->num_ + j_offset);           \
+        max_score = _mm512_add_ps(max_score, x_content);                       \
+        _mm512_storeu_ps(alpha + seq_offset + this->num_ + j_offset,           \
+                         max_score);                                           \
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset +    \
+                                                       this->num_ + j_offset), \
+                            max_j);                                            \
+        /* Calculate the offset of next step*/                                 \
+        j_offset += AVX512_FLOAT_BLOCK;                                        \
+        if (j == this->end_ - 1) {                                             \
+          if (this->rest_ > 0) {                                               \
+            j_offset += last_offset;                                           \
+          } else {                                                             \
+            break;                                                             \
+          }                                                                    \
+        }                                                                      \
+      }                                                                        \
+      seq_offset += this->num_;                                                \
+    }                                                                          \
+  }
+
+#ifdef __AVX__
+INTRIAVX_FLOAT(kEQ8);
+INTRIAVX_FLOAT(kGT8LT16);
+INTRIAVX_FLOAT(kEQ16);
+INTRIAVX_FLOAT(kGT16);
+#endif
+#ifdef __AVX2__
+INTRIAVX2_FLOAT(jit::avx2, kEQ8);
+INTRIAVX2_FLOAT(jit::avx2, kGT8LT16);
+INTRIAVX2_FLOAT(jit::avx2, kEQ16);
+INTRIAVX2_FLOAT(jit::avx2, kGT16);
+#endif
+#ifdef __AVX512F__
+INTRIAVX2_FLOAT(jit::avx512f, kEQ8);
+INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16);
+INTRIAVX512_FLOAT(kEQ16);
+INTRIAVX512_FLOAT(kGT16);
+#endif
+
+#undef INTRIAVX512_FLOAT
+#undef INTRIAVX2_FLOAT
+#undef INTRIAVX_FLOAT
+#undef INIT_ALPHA
+#undef UPDATE_ALPHA
+
+REGISTER_JITKERNEL(crf_decode, CRFDecodeKernel);
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4247580f491a7ca26259528ca74dd92e35785a9
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -0,0 +1,541 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <cmath>  // for exp
+#include <string>
+#include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+namespace jit = platform::jit;
+
+/* VExp JitKernel */
+template <typename T, jit::cpu_isa_t isa, jit_block>
+class VExpKernelImpl : public VExpKernel<T> {
+ public:
+  explicit VExpKernelImpl(int d) : VExpKernel<T>() { this->num_ = d; }
+  void Compute(const T* x, T* y) const override {
+    for (int i = 0; i < this->num_; ++i) {
+      y[i] = std::exp(x[i]);
+    }
+  }
+};
+
+#ifdef PADDLE_WITH_MKLML
+#define MKL_FLOAT(isa, block)                                               \
+  template <>                                                               \
+  void VExpKernelImpl<float, isa, block>::Compute(const float* x, float* y) \
+      const {                                                               \
+    platform::dynload::vsExp(this->num_, x, y);                             \
+  }
+
+#define MKL_DOUBLE(isa, block)                                                 \
+  template <>                                                                  \
+  void VExpKernelImpl<double, isa, block>::Compute(const double* x, double* y) \
+      const {                                                                  \
+    platform::dynload::vdExp(this->num_, x, y);                                \
+  }
+FOR_EACH_ISA(MKL_FLOAT, kLT8);
+FOR_EACH_ISA(MKL_FLOAT, kGT8LT16);
+FOR_EACH_ISA(MKL_FLOAT, kGT16);
+FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
+#endif
+
+namespace detail {
+
+#ifdef __AVX__
+
+#define ALIGN32 __attribute__((aligned(32)))
+
+#define _PS256_CONST(Name, Val)                                      \
+  static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
+                                                 Val, Val, Val, Val}
+
+#define _PI256_CONST(Name, Val)                                    \
+  static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
+                                               Val, Val, Val, Val}
+
+_PI256_CONST(0x7f, 0x7f);
+_PS256_CONST(one, 1.f);
+_PS256_CONST(0p5, 0.5f);
+_PS256_CONST(exp_hi, 88.3762626647949f);
+_PS256_CONST(exp_lo, -88.3762626647949f);
+_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS256_CONST(cephes_exp_C1, 0.693359375);
+_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
+_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+typedef union imm_xmm_union {
+  __m256i imm;
+  __m128i xmm[2];
+} imm_xmm_union;
+
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
+  {                                         \
+    imm_xmm_union u ALIGN32;                \
+    u.imm = imm_;                           \
+    xmm0_ = u.xmm[0];                       \
+    xmm1_ = u.xmm[1];                       \
+  }
+
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
+  {                                         \
+    imm_xmm_union u ALIGN32;                \
+    u.xmm[0] = xmm0_;                       \
+    u.xmm[1] = xmm1_;                       \
+    imm_ = u.imm;                           \
+  }
+
+#define AVX2_BITOP_USING_SSE2(fn)                           \
+  static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \
+    /* use SSE2 to perform the bitop AVX2 */                \
+    __m128i x1, x2;                                         \
+    __m256i ret;                                            \
+    COPY_IMM_TO_XMM(x, x1, x2);                             \
+    x1 = _mm_##fn(x1, y);                                   \
+    x2 = _mm_##fn(x2, y);                                   \
+    COPY_XMM_TO_IMM(x1, x2, ret);                           \
+    return ret;                                             \
+  }
+
+#define AVX2_INTOP_USING_SSE2(fn)                                    \
+  static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \
+    /* use SSE2 to perform the AVX2 integer operation */             \
+    __m128i x1, x2;                                                  \
+    __m128i y1, y2;                                                  \
+    __m256i ret;                                                     \
+    COPY_IMM_TO_XMM(x, x1, x2);                                      \
+    COPY_IMM_TO_XMM(y, y1, y2);                                      \
+    x1 = _mm_##fn(x1, y1);                                           \
+    x2 = _mm_##fn(x2, y2);                                           \
+    COPY_XMM_TO_IMM(x1, x2, ret);                                    \
+    return ret;                                                      \
+  }
+
+AVX2_BITOP_USING_SSE2(slli_epi32);
+AVX2_INTOP_USING_SSE2(add_epi32);
+
+#define AVXEXP_BASE                                                            \
+  __m256 tmp = _mm256_setzero_ps(), fx;                                        \
+  __m256 one = *reinterpret_cast<const __m256*>(_ps256_one);                   \
+  __m256i imm0;                                                                \
+  x = _mm256_min_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_hi));       \
+  x = _mm256_max_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_lo));       \
+  /* express exp(x) as exp(g + n*log(2)) */                                    \
+  fx = _mm256_mul_ps(x,                                                        \
+                     *reinterpret_cast<const __m256*>(_ps256_cephes_LOG2EF));  \
+  fx = _mm256_add_ps(fx, *reinterpret_cast<const __m256*>(_ps256_0p5));        \
+  tmp = _mm256_floor_ps(fx);                                                   \
+  /* if greater, substract 1 */                                                \
+  __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);                            \
+  mask = _mm256_and_ps(mask, one);                                             \
+  fx = _mm256_sub_ps(tmp, mask);                                               \
+  tmp = _mm256_mul_ps(fx,                                                      \
+                      *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C1)); \
+  __m256 z = _mm256_mul_ps(                                                    \
+      fx, *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C2));             \
+  x = _mm256_sub_ps(x, tmp);                                                   \
+  x = _mm256_sub_ps(x, z);                                                     \
+  z = _mm256_mul_ps(x, x);                                                     \
+  __m256 y = *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p0);           \
+  y = _mm256_mul_ps(y, x);                                                     \
+  y = _mm256_add_ps(y,                                                         \
+                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p1));   \
+  y = _mm256_mul_ps(y, x);                                                     \
+  y = _mm256_add_ps(y,                                                         \
+                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p2));   \
+  y = _mm256_mul_ps(y, x);                                                     \
+  y = _mm256_add_ps(y,                                                         \
+                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p3));   \
+  y = _mm256_mul_ps(y, x);                                                     \
+  y = _mm256_add_ps(y,                                                         \
+                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p4));   \
+  y = _mm256_mul_ps(y, x);                                                     \
+  y = _mm256_add_ps(y,                                                         \
+                    *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p5));   \
+  y = _mm256_mul_ps(y, z);                                                     \
+  y = _mm256_add_ps(y, x);                                                     \
+  y = _mm256_add_ps(y, one);                                                   \
+  /* build 2^n */                                                              \
+  imm0 = _mm256_cvttps_epi32(fx)
+
+__m256 ExpAVX(__m256 x) {
+  AVXEXP_BASE;
+  // two AVX2 instructions using SSE2
+  imm0 = avx2_mm256_add_epi32(imm0,
+                              *reinterpret_cast<const __m256i*>(_pi256_0x7f));
+  imm0 = avx2_mm256_slli_epi32(imm0, 23);
+  __m256 pow2n = _mm256_castsi256_ps(imm0);
+  y = _mm256_mul_ps(y, pow2n);
+  return y;
+}
+#endif
+
+#ifdef __AVX2__
+__m256 ExpAVX2(__m256 x) {
+  AVXEXP_BASE;
+  // two AVX2 instructions
+  imm0 = _mm256_add_epi32(imm0, *reinterpret_cast<const __m256i*>(_pi256_0x7f));
+  imm0 = _mm256_slli_epi32(imm0, 23);
+  __m256 pow2n = _mm256_castsi256_ps(imm0);
+  y = _mm256_mul_ps(y, pow2n);
+  return y;
+}
+#endif
+
+}  // namespace detail
+
+#define INTRI8_FLOAT(isa, expisa)                                          \
+  template <>                                                              \
+  void VExpKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
+      const {                                                              \
+    __m256 tmp = _mm256_loadu_ps(x);                                       \
+    _mm256_storeu_ps(y, expisa(tmp));                                      \
+  }
+
+#define INTRI16_FLOAT(isa, expisa)                                          \
+  template <>                                                               \
+  void VExpKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
+      const {                                                               \
+    __m256 tmp0 = _mm256_loadu_ps(x);                                       \
+    __m256 tmp1 = _mm256_loadu_ps(x + 8);                                   \
+    tmp0 = expisa(tmp0);                                                    \
+    tmp1 = expisa(tmp1);                                                    \
+    _mm256_storeu_ps(y, tmp0);                                              \
+    _mm256_storeu_ps(y + 8, tmp1);                                          \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx, detail::ExpAVX);
+INTRI16_FLOAT(jit::avx, detail::ExpAVX);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
+INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
+INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
+#endif
+// TODO(TJ): eq16 test and complete avx512
+
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+#undef MKL_FLOAT
+#undef MKL_DOUBLE
+
+REGISTER_JITKERNEL(vexp, VExpKernel);
+
+/* VSigmoid JitKernel */
+template <typename T, jit::cpu_isa_t isa, jit_block>
+class VSigmoidKernelImpl : public VSigmoidKernel<T> {
+ public:
+  explicit VSigmoidKernelImpl(int d) : VSigmoidKernel<T>() {
+    this->num_ = d;
+    vexp_ = KernelPool::Instance().template Get<VExpKernel<T>>(d);
+  }
+  void Compute(const T* x, T* y) const override {
+    const T min = SIGMOID_THRESHOLD_MIN;
+    const T max = SIGMOID_THRESHOLD_MAX;
+    for (int i = 0; i < this->num_; ++i) {
+      y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+      y[i] = static_cast<T>(0) - y[i];
+    }
+    vexp_->Compute(y, y);
+    for (int i = 0; i < this->num_; ++i) {
+      y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
+    }
+  }
+
+ private:
+  std::shared_ptr<const VExpKernel<T>> vexp_;
+};
+
+#define INTRI_SIGMOID(tmp, min, max, expisa)      \
+  tmp = _mm256_max_ps(tmp, min);                  \
+  tmp = _mm256_min_ps(tmp, max);                  \
+  tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \
+  tmp = expisa(tmp);                              \
+  tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
+  tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
+
+#define INTRI8_FLOAT(isa, expisa)                                              \
+  template <>                                                                  \
+  void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
+      const {                                                                  \
+    /* TODO(TJ): try to use static const*/                                     \
+    __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);                        \
+    __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);                        \
+    __m256 tmp = _mm256_loadu_ps(x);                                           \
+    INTRI_SIGMOID(tmp, min, max, expisa);                                      \
+    _mm256_storeu_ps(y, tmp);                                                  \
+  }
+
+#define INTRI16_FLOAT(isa, expisa)                                      \
+  template <>                                                           \
+  void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(const float* x,   \
+                                                      float* y) const { \
+    __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);                 \
+    __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);                 \
+    __m256 tmp0 = _mm256_loadu_ps(x);                                   \
+    __m256 tmp1 = _mm256_loadu_ps(x + 8);                               \
+    INTRI_SIGMOID(tmp0, min, max, expisa);                              \
+    INTRI_SIGMOID(tmp1, min, max, expisa);                              \
+    _mm256_storeu_ps(y, tmp0);                                          \
+    _mm256_storeu_ps(y + 8, tmp1);                                      \
+  }
+
+#define INTRI_GT8LT16_FLOAT(isa, expisa)                                     \
+  template <>                                                                \
+  VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d)        \
+      : VSigmoidKernel<float>() {                                            \
+    this->num_ = d;                                                          \
+    this->end_ = AVX_FLOAT_BLOCK;                                            \
+    this->rest_ = d - this->end_;                                            \
+    vexp_ =                                                                  \
+        KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
+  }                                                                          \
+  template <>                                                                \
+  void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x,     \
+                                                         float* y) const {   \
+    __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);                      \
+    __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);                      \
+    __m256 tmp = _mm256_loadu_ps(x);                                         \
+    INTRI_SIGMOID(tmp, min, max, expisa);                                    \
+    _mm256_storeu_ps(y, tmp);                                                \
+    const float min_ = SIGMOID_THRESHOLD_MIN;                                \
+    const float max_ = SIGMOID_THRESHOLD_MAX;                                \
+    for (int i = this->end_; i < this->num_; ++i) {                          \
+      y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]);           \
+      y[i] = 0.f - y[i];                                                     \
+    }                                                                        \
+    vexp_->Compute(y + this->end_, y + this->end_);                          \
+    for (int i = this->end_; i < this->num_; ++i) {                          \
+      y[i] = 1.f / (1.f + y[i]);                                             \
+    }                                                                        \
+  }
+
+#define INTRI_GT16_FLOAT(isa, expisa)                                        \
+  template <>                                                                \
+  VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d)           \
+      : VSigmoidKernel<float>() {                                            \
+    this->num_ = d;                                                          \
+    this->rest_ = d % AVX_FLOAT_BLOCK;                                       \
+    this->end_ = d - this->rest_;                                            \
+    vexp_ =                                                                  \
+        KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
+  }                                                                          \
+  template <>                                                                \
+  void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x,        \
+                                                      float* y) const {      \
+    __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);                      \
+    __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);                      \
+    for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) {                  \
+      __m256 tmp = _mm256_loadu_ps(x + i);                                   \
+      INTRI_SIGMOID(tmp, min, max, expisa);                                  \
+      _mm256_storeu_ps(y + i, tmp);                                          \
+    }                                                                        \
+    const float min_ = SIGMOID_THRESHOLD_MIN;                                \
+    const float max_ = SIGMOID_THRESHOLD_MAX;                                \
+    for (int i = this->end_; i < this->num_; ++i) {                          \
+      y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]);           \
+      y[i] = 0.f - y[i];                                                     \
+    }                                                                        \
+    vexp_->Compute(y + this->end_, y + this->end_);                          \
+    for (int i = this->end_; i < this->num_; ++i) {                          \
+      y[i] = 1.f / (1.f + y[i]);                                             \
+    }                                                                        \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx, detail::ExpAVX);
+INTRI16_FLOAT(jit::avx, detail::ExpAVX);
+INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX);
+INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
+INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
+// maybe use avx at gt8lt16 and gt16
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
+INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
+// maybe use avx2 at gt8lt16 and gt16
+#endif
+
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+#undef INTRI_GT8LT16_FLOAT
+#undef INTRI_GT16_FLOAT
+#undef INTRI_VSIGMOID
+
+REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel);
+
+/* VTanh JitKernel */
+template <typename T, jit::cpu_isa_t isa, jit_block>
+class VTanhKernelImpl : public VTanhKernel<T> {
+ public:
+  explicit VTanhKernelImpl(int d) : VTanhKernel<T>() {
+    this->num_ = d;
+    vscal_ = KernelPool::Instance().template Get<VScalKernel<T>>(d);
+    vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<T>>(d);
+    vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<T>>(d);
+  }
+  void Compute(const T* x, T* y) const override {
+    vscal_->Compute(static_cast<T>(2), x, y);
+    vsigmoid_->Compute(y, y);
+    vscal_->Compute(static_cast<T>(2), y);
+    vaddbias_->Compute(static_cast<T>(-1), y, y);
+  }
+
+ private:
+  std::shared_ptr<const VScalKernel<T>> vscal_;
+  std::shared_ptr<const VSigmoidKernel<T>> vsigmoid_;
+  std::shared_ptr<const VAddBiasKernel<T>> vaddbias_;
+};
+
+#define INTRI_VTANH(tmp, expisa)                           \
+  tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp);         \
+  tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \
+  tmp = expisa(tmp);                                       \
+  tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);          \
+  tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp);          \
+  tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
+
+#define INTRI8_FLOAT(isa, expisa)                                           \
+  template <>                                                               \
+  void VTanhKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
+      const {                                                               \
+    __m256 tmp = _mm256_loadu_ps(x);                                        \
+    INTRI_VTANH(tmp, expisa);                                               \
+    _mm256_storeu_ps(y, tmp);                                               \
+  }
+
+#define INTRI16_FLOAT(isa, expisa)                                           \
+  template <>                                                                \
+  void VTanhKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
+      const {                                                                \
+    __m256 tmp0 = _mm256_loadu_ps(x);                                        \
+    __m256 tmp1 = _mm256_loadu_ps(x + 8);                                    \
+    INTRI_VTANH(tmp0, expisa);                                               \
+    INTRI_VTANH(tmp1, expisa);                                               \
+    _mm256_storeu_ps(y, tmp0);                                               \
+    _mm256_storeu_ps(y + 8, tmp1);                                           \
+  }
+
+#define INTRI_GT8LT16_FLOAT(isa, expisa)                                      \
+  template <>                                                                 \
+  VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d)               \
+      : VTanhKernel<float>() {                                                \
+    this->num_ = d;                                                           \
+    this->end_ = AVX_FLOAT_BLOCK;                                             \
+    this->rest_ = d - this->end_;                                             \
+    vscal_ =                                                                  \
+        KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
+    vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>(   \
+        this->rest_);                                                         \
+    vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>(   \
+        this->rest_);                                                         \
+  }                                                                           \
+  template <>                                                                 \
+  void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x,         \
+                                                      float* y) const {       \
+    __m256 tmp = _mm256_loadu_ps(x);                                          \
+    INTRI_VTANH(tmp, expisa);                                                 \
+    _mm256_storeu_ps(y, tmp);                                                 \
+    x += AVX_FLOAT_BLOCK;                                                     \
+    y += AVX_FLOAT_BLOCK;                                                     \
+    vscal_->Compute(2.f, x, y);                                               \
+    vsigmoid_->Compute(y, y);                                                 \
+    vscal_->Compute(2.f, y);                                                  \
+    vaddbias_->Compute(-1.f, y, y);                                           \
+  }
+
+#define INTRI_GT16_FLOAT(isa, expisa)                                         \
+  template <>                                                                 \
+  VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d)                  \
+      : VTanhKernel<float>() {                                                \
+    this->num_ = d;                                                           \
+    this->rest_ = d % AVX_FLOAT_BLOCK;                                        \
+    this->end_ = d - this->rest_;                                             \
+    vscal_ =                                                                  \
+        KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
+    vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>(   \
+        this->rest_);                                                         \
+    vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>(   \
+        this->rest_);                                                         \
+  }                                                                           \
+  template <>                                                                 \
+  void VTanhKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y)  \
+      const {                                                                 \
+    for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) {                   \
+      __m256 tmp = _mm256_loadu_ps(x + i);                                    \
+      INTRI_VTANH(tmp, expisa);                                               \
+      _mm256_storeu_ps(y + i, tmp);                                           \
+    }                                                                         \
+    x += this->end_;                                                          \
+    y += this->end_;                                                          \
+    vscal_->Compute(2.f, x, y);                                               \
+    vsigmoid_->Compute(y, y);                                                 \
+    vscal_->Compute(2.f, y);                                                  \
+    vaddbias_->Compute(-1.f, y, y);                                           \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx, detail::ExpAVX);
+INTRI16_FLOAT(jit::avx, detail::ExpAVX);
+INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX);
+INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
+INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
+// maybe use avx at gt8lt16 and gt16
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
+INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
+// maybe use avx at gt8lt16 and gt16
+#endif
+
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+#undef INTRI_GT8LT16_FLOAT
+#undef INTRI_GT16_FLOAT
+#undef INTRI_VTANH
+
+REGISTER_JITKERNEL(vtanh, VTanhKernel);
+
+#undef JITKERNEL_NEW_ACT_IMPL
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8e55f2673560ff6afa34376b73275b57a8ceea1
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+#define SEARCH_BLOCK(macro_, ker, dtype, isa)                 \
+  if (d < AVX_FLOAT_BLOCK) {                                  \
+    macro_(ker, dtype, isa, kLT8);                            \
+  } else if (d == AVX_FLOAT_BLOCK) {                          \
+    macro_(ker, dtype, isa, kEQ8);                            \
+  } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
+    macro_(ker, dtype, isa, kGT8LT16);                        \
+  } else if (d == AVX512_FLOAT_BLOCK) {                       \
+    macro_(ker, dtype, isa, kEQ16);                           \
+  } else {                                                    \
+    macro_(ker, dtype, isa, kGT16);                           \
+  }
+
+#define SEARCH_ISA_BLOCK(macro_, ker, dtype)        \
+  if (jit::MayIUse(jit::avx512f)) {                 \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \
+  } else if (jit::MayIUse(jit::avx2)) {             \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx2);    \
+  } else if (jit::MayIUse(jit::avx)) {              \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx);     \
+  } else {                                          \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
+  }
+
+#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
+  template <>                                   \
+  std::shared_ptr<const ker_class<ker_dtype>>   \
+  KernelPool::Get<ker_class<ker_dtype>, int>(int d)
+
+#define JITKERNEL_KEY(ker_key, dtype_key) \
+  #ker_key #dtype_key + std::to_string(d)
+
+#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \
+  p = std::dynamic_pointer_cast<ker<dtype>>(   \
+      std::make_shared<ker##Impl<dtype, isa, k>>(d))
+
+#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \
+                             marco_declare, macro_key, macro_impl)     \
+  marco_declare(ker_class, ker_dtype) {                                \
+    std::string key = macro_key(ker_key, dtype_key);                   \
+    if (kers_.find(key) == kers_.end()) {                              \
+      std::shared_ptr<ker_class<ker_dtype>> p;                         \
+      SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype);              \
+      kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)});       \
+      return p;                                                        \
+    }                                                                  \
+    return std::dynamic_pointer_cast<const ker_class<ker_dtype>>(      \
+        kers_.at(key));                                                \
+  }
+
+#define REGISTER_JITKERNEL(ker_key, ker_class)                           \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE,  \
+                       JITKERNEL_KEY, JITKERNEL_NEW_IMPL);               \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \
+                       JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
+
+#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key,  \
+                                macro_impl)                                    \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
+                       macro_impl);                                            \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare,           \
+                       macro_key, macro_impl)
+
+#define FOR_EACH_ISA(macro_, block) \
+  macro_(jit::avx512f, block);      \
+  macro_(jit::avx2, block);         \
+  macro_(jit::avx, block);          \
+  macro_(jit::isa_any, block)
+
+#define FOR_EACH_BLOCK(macro_, isa) \
+  macro_(isa, kLT8);                \
+  macro_(isa, kEQ8);                \
+  macro_(isa, kGT8LT16);            \
+  macro_(isa, kEQ16);               \
+  macro_(isa, kGT16)
+
+#define FOR_EACH_ISA_BLOCK(macro_)      \
+  FOR_EACH_BLOCK(macro_, jit::avx512f); \
+  FOR_EACH_BLOCK(macro_, jit::avx2);    \
+  FOR_EACH_BLOCK(macro_, jit::avx);     \
+  FOR_EACH_BLOCK(macro_, jit::isa_any)
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fab293f7d03eb923995fa4cd99af955a34faa6a4
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -0,0 +1,485 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <string>
+#include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/macros.h"
+
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+namespace detail {
+#ifdef __AVX__
+__m256 ExpAVX(__m256 x);
+#endif
+
+#ifdef __AVX2__
+__m256 ExpAVX2(__m256 x);
+#endif
+
+}  // namespace detail
+
+namespace jit = platform::jit;
+
+#ifdef __AVX__
+typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type;
+
+class AVXAct {
+ public:
+  virtual ~AVXAct() = default;
+  virtual __m256 Compute(__m256 x) const = 0;
+};
+
+template <act_type type, jit::cpu_isa_t isa>
+class AVXActImpl : public AVXAct {
+ public:
+  __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); }
+};
+
+#define AVX_SIGMOID(isa, expisa)                                 \
+  template <>                                                    \
+  __m256 AVXActImpl<kSigmoid, isa>::Compute(__m256 x) const {    \
+    __m256 ones = _mm256_set1_ps(1.0f);                          \
+    x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \
+    x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \
+    x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x);                  \
+    x = expisa(x);                                               \
+    x = _mm256_add_ps(ones, x);                                  \
+    return _mm256_div_ps(ones, x);                               \
+  }
+
+#define AVX_TANH(isa, expisa)                              \
+  template <>                                              \
+  __m256 AVXActImpl<kTanh, isa>::Compute(__m256 x) const { \
+    __m256 ones = _mm256_set1_ps(1.0f);                    \
+    x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x);           \
+    x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT));   \
+    x = expisa(x);                                         \
+    x = _mm256_add_ps(ones, x);                            \
+    x = _mm256_div_ps(_mm256_set1_ps(2.0f), x);            \
+    return _mm256_sub_ps(x, ones);                         \
+  }
+
+#define AVX_RELU(isa)                                      \
+  template <>                                              \
+  __m256 AVXActImpl<kRelu, isa>::Compute(__m256 x) const { \
+    return _mm256_max_ps(x, _mm256_setzero_ps());          \
+  }
+
+#define AVX_IDENTITY(isa)                                      \
+  template <>                                                  \
+  __m256 AVXActImpl<kIdentity, isa>::Compute(__m256 x) const { \
+    return x;                                                  \
+  }
+
+#define FOR_EACH_AVX_ISA(macro_) \
+  macro_(jit::avx);              \
+  macro_(jit::avx2);             \
+  macro_(jit::avx512f)
+
+FOR_EACH_AVX_ISA(AVX_RELU);
+FOR_EACH_AVX_ISA(AVX_IDENTITY);
+
+AVX_SIGMOID(jit::avx, detail::ExpAVX);
+AVX_TANH(jit::avx, detail::ExpAVX);
+
+#ifdef __AVX2__
+AVX_SIGMOID(jit::avx2, detail::ExpAVX2);
+AVX_SIGMOID(jit::avx512f, detail::ExpAVX2);
+AVX_TANH(jit::avx2, detail::ExpAVX2);
+AVX_TANH(jit::avx512f, detail::ExpAVX2);
+#endif
+
+#undef FOR_EACH_AVX_ISA
+#undef AVX_IDENTITY
+#undef AVX_RELU
+#undef AVX_TANH
+#undef AVX_SIGMOID
+
+#endif
+
+template <typename T>
+static std::shared_ptr<const VActKernel<T>> GetActKernel(
+    const std::string& type, int n) {
+  if (type == "sigmoid") {
+    return std::dynamic_pointer_cast<const VActKernel<T>>(
+        KernelPool::Instance().template Get<VSigmoidKernel<T>>(n));
+  } else if (type == "relu") {
+    return std::dynamic_pointer_cast<const VActKernel<T>>(
+        KernelPool::Instance().template Get<VReluKernel<T>>(n));
+  } else if (type == "tanh") {
+    return std::dynamic_pointer_cast<const VActKernel<T>>(
+        KernelPool::Instance().template Get<VTanhKernel<T>>(n));
+  } else if (type == "identity" || type == "") {
+    return std::dynamic_pointer_cast<const VActKernel<T>>(
+        KernelPool::Instance().template Get<VIdentityKernel<T>>(n));
+  }
+  PADDLE_THROW("Not support type: %s", type);
+  return nullptr;
+}
+
+#ifdef __AVX__
+template <jit::cpu_isa_t isa>
+static std::unique_ptr<AVXAct> GetAVXAct(const std::string& type) {
+  if (type == "sigmoid") {
+    return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid, isa>());
+  } else if (type == "relu") {
+    return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu, isa>());
+  } else if (type == "tanh") {
+    return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh, isa>());
+  } else if (type == "identity" || type == "") {
+    return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity, isa>());
+  }
+  PADDLE_THROW("Not support type: %s", type);
+  return nullptr;
+}
+#endif
+
+/* LSTM JitKernel */
+template <typename T, jit::cpu_isa_t isa, jit_block>
+class LSTMKernelImpl : public LSTMKernel<T> {
+ public:
+  explicit LSTMKernelImpl(const std::string& act_gate,
+                          const std::string& act_cand,
+                          const std::string& act_cell, int d)
+      : LSTMKernel<T>() {
+    d_ = d;
+    d2_ = d * 2;
+    d3_ = d * 3;
+    act_gate_d3_ = GetActKernel<T>(act_gate, d3_);
+    act_gate_d_ = GetActKernel<T>(act_gate, d);
+    act_cand_d_ = GetActKernel<T>(act_cand, d);
+    act_cell_d_ = GetActKernel<T>(act_cell, d);
+    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
+    vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
+  }
+
+  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
+                   T* checked) const override {
+    // gates: W_ch, W_ih, W_fh, W_oh
+    act_gate_d3_->Compute(gates + d_, gates + d_);
+
+    /* C_t = C_t-1 * fgated + cand_gated * igated */
+    act_cand_d_->Compute(gates, gates);
+    vmul_d_->Compute(gates, gates + d_, gates + d_);
+    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_);
+    vadd_d_->Compute(gates + d_, gates + d2_, ct);
+
+    /* H_t = act_cell(C_t) * ogated */
+    act_cell_d_->Compute(ct, gates + d2_);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+  }
+  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
+    /* C_t = igated * cgated*/
+    act_gate_d_->Compute(gates + d_, gates + d_);
+    act_cand_d_->Compute(gates, gates);
+    vmul_d_->Compute(gates, gates + d_, ct);
+    /* H_t = act_cell(C_t) * ogated */
+    act_gate_d_->Compute(gates + d3_, gates + d3_);
+    act_cell_d_->Compute(ct, gates + d2_);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+  }
+
+ private:
+  int d_, d2_, d3_;
+  std::shared_ptr<const VActKernel<T>> act_gate_d3_, act_gate_d_, act_cand_d_,
+      act_cell_d_;
+  std::shared_ptr<const VMulKernel<T>> vmul_d_;
+  std::shared_ptr<const VAddKernel<T>> vadd_d_;
+#ifdef __AVX__
+  std::unique_ptr<const AVXAct> avx_act_gate_, avx_act_cand_, avx_act_cell_;
+#endif
+};
+
+#define INTRI8_FLOAT(isa)                                                    \
+  template <>                                                                \
+  LSTMKernelImpl<float, isa, kEQ8>::LSTMKernelImpl(                          \
+      const std::string& act_gate, const std::string& act_cand,              \
+      const std::string& act_cell, int d)                                    \
+      : LSTMKernel<float>() {                                                \
+    avx_act_gate_ = GetAVXAct<isa>(act_gate);                                \
+    avx_act_cand_ = GetAVXAct<isa>(act_cand);                                \
+    avx_act_cell_ = GetAVXAct<isa>(act_cell);                                \
+  }                                                                          \
+  template <>                                                                \
+  void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt(                        \
+      float* gates, const float* ct_1, float* ct, float* ht,                 \
+      const float* wp_data, float* checked) const {                          \
+    /* gates: W_ch, W_ih, W_fh, W_oh */                                      \
+    __m256 c, i, f, o;                                                       \
+    c = _mm256_loadu_ps(gates);                                              \
+    i = _mm256_loadu_ps(gates + 8);                                          \
+    f = _mm256_loadu_ps(gates + 16);                                         \
+    o = _mm256_loadu_ps(gates + 24);                                         \
+    /* C_t = C_t-1 * fgated + cand_gated * igated*/                          \
+    c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
+    i = _mm256_loadu_ps(ct_1);                                               \
+    f = _mm256_mul_ps(i, avx_act_gate_->Compute(f));                         \
+    f = _mm256_add_ps(c, f);                                                 \
+    _mm256_storeu_ps(ct, f);                                                 \
+    /* H_t = act_cell(C_t) * ogated */                                       \
+    o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
+    _mm256_storeu_ps(ht, o);                                                 \
+  }                                                                          \
+  template <>                                                                \
+  void LSTMKernelImpl<float, isa, kEQ8>::ComputeC1H1(                        \
+      float* gates, float* ct, float* ht, const float* wp_data) const {      \
+    __m256 c, i, o;                                                          \
+    c = _mm256_loadu_ps(gates);                                              \
+    i = _mm256_loadu_ps(gates + 8);                                          \
+    o = _mm256_loadu_ps(gates + 24);                                         \
+    /* C_t = igated * cgated*/                                               \
+    c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \
+    _mm256_storeu_ps(ct, c);                                                 \
+    /* H_t = act_cell(C_t) * ogated */                                       \
+    o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \
+    _mm256_storeu_ps(ht, o);                                                 \
+  }
+
+// TODO(TJ): optimize keq16
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+#endif
+
+/* Peephole JitKernel */
+template <typename T, jit::cpu_isa_t isa, jit_block>
+class PeepholeKernelImpl : public LSTMKernel<T> {
+ public:
+  explicit PeepholeKernelImpl(const std::string& act_gate,
+                              const std::string& act_cand,
+                              const std::string& act_cell, int d)
+      : LSTMKernel<T>() {
+    d_ = d;
+    d2_ = d * 2;
+    d3_ = d * 3;
+    act_gate_d_ = GetActKernel<T>(act_gate, d);
+    act_cand_d_ = GetActKernel<T>(act_cand, d);
+    act_cell_d_ = GetActKernel<T>(act_cell, d);
+    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
+    vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
+    vadd_d2_ = KernelPool::Instance().template Get<VAddKernel<T>>(d2_);
+    act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
+  }
+
+  void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
+                   T* checked) const override {
+    /* get fgated and igated*/
+    vmul_d_->Compute(wp_data, ct_1, checked);
+    vmul_d_->Compute(wp_data + d_, ct_1, checked + d_);
+    vadd_d2_->Compute(checked, gates + d_, gates + d_);
+    act_gate_d2_->Compute(gates + d_, gates + d_);
+    /* C_t = C_t-1 * fgated + cand_gated * igated*/
+    act_cand_d_->Compute(gates, gates);
+    vmul_d_->Compute(gates, gates + d_, gates + d_);
+    vmul_d_->Compute(ct_1, gates + d2_, gates + d2_);
+    vadd_d_->Compute(gates + d_, gates + d2_, ct);
+    /* get ogated*/
+    vmul_d_->Compute(wp_data + d2_, ct, gates + d_);
+    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
+    act_gate_d_->Compute(gates + d3_, gates + d3_);
+    /* H_t = act_cell(C_t) * ogated */
+    act_cell_d_->Compute(ct, gates + d2_);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+  }
+
+  void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override {
+    /* C_t = igated * cgated*/
+    act_gate_d_->Compute(gates + d_, gates + d_);
+    act_cand_d_->Compute(gates, gates);
+    vmul_d_->Compute(gates, gates + d_, ct);
+    /* get outgated, put W_oc * C_t on igated */
+    vmul_d_->Compute(wp_data + d2_, ct, gates + d_);
+    vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_);
+    /* H_t = act_cell(C_t) * ogated */
+    act_gate_d_->Compute(gates + d3_, gates + d3_);
+    act_cell_d_->Compute(ct, gates + d2_);
+    vmul_d_->Compute(gates + d2_, gates + d3_, ht);
+  }
+
+ private:
+  int d_, d2_, d3_;
+  std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_cand_d_,
+      act_cell_d_;
+  std::shared_ptr<const VMulKernel<T>> vmul_d_;
+  std::shared_ptr<const VAddKernel<T>> vadd_d_, vadd_d2_;
+};
+
+#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype)                  \
+  template <>                                                         \
+  std::shared_ptr<const LSTMKernel<ker_dtype>>                        \
+  KernelPool::Get<LSTMKernel<ker_dtype>, const std::string&,          \
+                  const std::string&, const std::string&, int, bool>( \
+      const std::string& act_gate, const std::string& act_cand,       \
+      const std::string& act_cell, int d, bool use_peephole)
+
+#define JITKERNEL_KEY_LSTM(ker_key, dtype_key)                               \
+  #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \
+                                       (use_peephole ? "p" : "n")
+
+#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k)                    \
+  if (use_peephole) {                                                  \
+    p = std::dynamic_pointer_cast<ker<dtype>>(                         \
+        std::make_shared<PeepholeKernelImpl<dtype, isa, k>>(           \
+            act_gate, act_cand, act_cell, d));                         \
+  } else {                                                             \
+    p = std::dynamic_pointer_cast<ker<dtype>>(                         \
+        std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_cand, \
+                                                   act_cell, d));      \
+  }
+
+REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM,
+                        JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL);
+
+#undef INTRI8_FLOAT
+#undef JITKERNEL_DECLARE_LSTM
+#undef JITKERNEL_KEY_LSTM
+#undef JITKERNEL_NEW_LSTM_IMPL
+
+/* GRU JitKernel */
+template <typename T, jit::cpu_isa_t isa, jit_block>
+class GRUKernelImpl : public GRUKernel<T> {
+ public:
+  explicit GRUKernelImpl(const std::string& act_gate,
+                         const std::string& act_state, int d)
+      : GRUKernel<T>() {
+    d_ = d;
+    d2_ = d * 2;
+    act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
+    act_gate_d_ = GetActKernel<T>(act_gate, d);
+    act_state_d_ = GetActKernel<T>(act_state, d);
+    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
+  }
+
+  void ComputeH1(T* gates, T* ht) const override {
+    act_gate_d_->Compute(gates, gates);
+    act_state_d_->Compute(gates + d2_, gates + d2_);
+    vmul_d_->Compute(gates, gates + d2_, ht);
+  }
+
+  void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override {
+    // W: {W_update, W_reset; W_state}
+    act_gate_d2_->Compute(gates, gates);
+    vmul_d_->Compute(ht_1, gates + d_, ht);
+  }
+
+  void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override {
+    T* y = gates + d2_;
+    act_state_d_->Compute(y, y);
+    // out = zt*ht~ + (1-zt)*ht_1
+    for (int i = 0; i < d_; ++i) {
+      ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
+    }
+  }
+
+ private:
+  int d_, d2_;
+  std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_state_d_;
+  std::shared_ptr<const VMulKernel<T>> vmul_d_;
+#ifdef __AVX__
+  std::unique_ptr<const AVXAct> avx_act_gate_, avx_act_state_;
+#endif
+};
+
+#define INTRI8_FLOAT(isa)                                                     \
+  template <>                                                                 \
+  GRUKernelImpl<float, isa, kEQ8>::GRUKernelImpl(                             \
+      const std::string& act_gate, const std::string& act_state, int d)       \
+      : GRUKernel<float>() {                                                  \
+    avx_act_gate_ = GetAVXAct<isa>(act_gate);                                 \
+    avx_act_state_ = GetAVXAct<isa>(act_state);                               \
+  }                                                                           \
+  template <>                                                                 \
+  void GRUKernelImpl<float, isa, kEQ8>::ComputeH1(float* gates, float* ht)    \
+      const {                                                                 \
+    __m256 u, s;                                                              \
+    /* W: {W_update, W_reset; W_state} */                                     \
+    u = _mm256_loadu_ps(gates);                                               \
+    s = _mm256_loadu_ps(gates + 16);                                          \
+    s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \
+    _mm256_storeu_ps(ht, s);                                                  \
+  }                                                                           \
+  template <>                                                                 \
+  void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart1(                       \
+      float* gates, const float* ht_1, float* ht) const {                     \
+    /* not exactly equal the any implementation */                            \
+    __m256 r, ht0;                                                            \
+    r = _mm256_loadu_ps(gates + 8);                                           \
+    ht0 = _mm256_loadu_ps(ht_1);                                              \
+    r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0);                        \
+    _mm256_storeu_ps(ht, r);                                                  \
+  }                                                                           \
+  template <>                                                                 \
+  void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart2(                       \
+      float* gates, const float* ht_1, float* ht) const {                     \
+    /* not exactly equal the any implementation */                            \
+    __m256 u, s, ht0;                                                         \
+    u = _mm256_loadu_ps(gates);                                               \
+    s = _mm256_loadu_ps(gates + 16);                                          \
+    ht0 = _mm256_loadu_ps(ht_1);                                              \
+    u = avx_act_gate_->Compute(u);                                            \
+    s = _mm256_mul_ps(u, avx_act_state_->Compute(s));                         \
+    u = _mm256_sub_ps(_mm256_set1_ps(1.f), u);                                \
+    u = _mm256_mul_ps(u, ht0);                                                \
+    u = _mm256_add_ps(s, u);                                                  \
+    _mm256_storeu_ps(ht, u);                                                  \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+#endif
+
+#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype)                       \
+  template <>                                                             \
+  std::shared_ptr<const GRUKernel<ker_dtype>> KernelPool::Get<            \
+      GRUKernel<ker_dtype>, const std::string&, const std::string&, int>( \
+      const std::string& act_gate, const std::string& act_state, int d)
+
+#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \
+  #ker_key #dtype_key + std::to_string(d) + act_gate + act_state
+
+#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \
+  p = std::dynamic_pointer_cast<ker<dtype>>(       \
+      std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_state, d));
+
+REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DECLARE_GRU,
+                        JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL);
+
+#undef INTRI8_FLOAT
+#undef JITKERNEL_NEW_GRU_IMPL
+#undef JITKERNEL_KEY_GRU
+#undef JITKERNEL_DECLARE_GRU
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c9e6ab740da4e39de3dc41f9df352b88e696c38d
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -0,0 +1,807 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <sys/time.h>
+#include <cmath>    // for exp
+#include <cstring>  // for memcpy
+#include <random>
+#include <string>
+#include <vector>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+constexpr int repeat = 20000;
+
+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+
+template <typename T>
+void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
+               const T upper = static_cast<T>(20.f)) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+  }
+}
+
+void vrelu_ref(const int n, const float* x, float* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] > 0.f ? x[i] : 0.f;
+  }
+}
+
+#if defined __AVX__ || defined __AVX2__
+void vrelu_intri8(const int n, const float* x, float* y) {
+  __m256 tmp = _mm256_loadu_ps(x);
+  tmp = _mm256_max_ps(tmp, _mm256_setzero_ps());
+  _mm256_storeu_ps(y, tmp);
+}
+#endif
+
+TEST(JitKernel, vrelu) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
+    std::vector<float> x(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data(), -10.f, 1.f);
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VReluKernel<float>>(d);
+    const float* x_data = x.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vrelu_ref(d, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+#if defined __AVX__ || defined __AVX2__
+    if (d == 8) {
+      auto si0 = GetCurrentUS();
+      for (int i = 0; i < repeat; ++i) {
+        vrelu_intri8(d, x_data, zref_data);
+      }
+      auto si1 = GetCurrentUS();
+      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+    }
+#endif
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+void vaddbias_ref(const int n, const float a, const float* x, float* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+}
+
+TEST(JitKernel, vaddbias) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) {
+    std::vector<float> x(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data(), -2.f, 2.f);
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(d);
+    const float a = 2.f;
+    const float* x_data = x.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vaddbias_ref(d, a, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(a, x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+void vexp_ref(const int n, const float* x, float* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::exp(x[i]);
+  }
+}
+
+#ifdef PADDLE_WITH_MKLML
+void vexp_mkl(const int n, const float* x, float* y) {
+  paddle::platform::dynload::vsExp(n, x, y);
+}
+#endif
+
+TEST(JitKernel, vexp) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 128, 256}) {
+    std::vector<float> x(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data(), -2.f, 2.f);
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(d);
+    const float* x_data = x.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vexp_ref(d, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+
+#ifdef PADDLE_WITH_MKLML
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vexp_mkl(d, x_data, zref_data);
+    }
+    auto tmkle = GetCurrentUS();
+#endif
+
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+#ifdef PADDLE_WITH_MKLML
+            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
+#else
+            << " us, "
+#endif
+            << "tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+inline float _sigmoid(float x) {
+  const float min = SIGMOID_THRESHOLD_MIN;
+  const float max = SIGMOID_THRESHOLD_MAX;
+  float tmp = (x < min) ? min : ((x > max) ? max : x);
+  return 1.f / (1.f + std::exp(-tmp));
+}
+
+void vsigmoid_ref(const int n, const float* x, float* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = _sigmoid(x[i]);
+  }
+}
+
+void vsigmoid_better(
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp,
+    const int n, const float* x, float* y) {
+  const float min = SIGMOID_THRESHOLD_MIN;
+  const float max = SIGMOID_THRESHOLD_MAX;
+  for (int i = 0; i < n; ++i) {
+    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+    y[i] = 0.f - y[i];
+  }
+  vexp->Compute(y, y);
+  for (int i = 0; i < n; ++i) {
+    y[i] = 1.f / (1.f + y[i]);
+  }
+}
+
+TEST(JitKernel, vsigmoid) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
+    std::vector<float> x(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data(), -2.f, 2.f);
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(d);
+    const auto& vexp =
+        jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(d);
+    const float* x_data = x.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vsigmoid_better(vexp, d, x_data, zref_data);
+    }
+    auto tmkle = GetCurrentUS();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vsigmoid_ref(d, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
+            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; }
+
+void vtanh_ref(const int n, const float* x, float* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = _tanh(x[i]);
+  }
+}
+
+void vtanh_better(
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VScalKernel<float>>& vscal,
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
+        vsigmoid,
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VAddBiasKernel<float>>&
+        vaddbias,
+    const int n, const float* x, float* y) {
+  vscal->Compute(2.f, x, y);
+  vsigmoid->Compute(y, y);
+  vscal->Compute(2.f, y);
+  vaddbias->Compute(-1.f, y, y);
+}
+
+TEST(JitKernel, vtanh) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
+    std::vector<float> x(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data(), -2.f, 2.f);
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VTanhKernel<float>>(d);
+    const auto& vscal =
+        jit::KernelPool::Instance().template Get<jit::VScalKernel<float>>(d);
+    const auto& vsigmoid =
+        jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(d);
+    const auto& vaddbias =
+        jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(d);
+    const float* x_data = x.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vtanh_better(vscal, vsigmoid, vaddbias, d, x_data, zref_data);
+    }
+    auto tmkle = GetCurrentUS();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vtanh_ref(d, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
+            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+void lstm_ctht_ref(
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
+        vsigmoid_3d,
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VTanhKernel<float>>& vtanh_d,
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp_1,
+    const int d, float* gates, const float* ct_1, float* ct, float* ht) {
+  vsigmoid_3d->Compute(gates + d, gates + d);
+  vtanh_d->Compute(gates, gates);
+  const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3;
+  const float min = SIGMOID_THRESHOLD_MIN;
+  const float max = SIGMOID_THRESHOLD_MAX;
+  for (int k = 0; k < d; ++k) {
+    // C_t = C_t-1 * fgated + cand_gated * igated
+    ct[k] = ct_1[k] * f[k] + gates[k] * i[k];
+    // H_t = act_cell(C_t) * ogated
+    float tmp = ct[k] * 2;
+    tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
+    vexp_1->Compute(&tmp, &tmp);
+    tmp = 2.f / (1.f + tmp) - 1.f;
+    ht[k] = tmp * o[k];
+  }
+}
+
+void lstm_ctht_better(
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VSigmoidKernel<float>>&
+        vsigmoid_3d,
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VTanhKernel<float>>& vtanh_d,
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VMulKernel<float>>& vmul_d,
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd_d,
+    const int d, float* gates, const float* ct_1, float* ct, float* ht) {
+  int d2 = d * 2;
+  vsigmoid_3d->Compute(gates + d, gates + d);
+  vtanh_d->Compute(gates, gates);
+  vmul_d->Compute(gates, gates + d, gates + d);
+  vmul_d->Compute(ct_1, gates + d2, gates + d2);
+  vadd_d->Compute(gates + d, gates + d2, ct);
+  /* H_t = act_cell(C_t) * ogated */
+  vtanh_d->Compute(ct, gates + d2);
+  vmul_d->Compute(gates + d2, gates + d * 3, ht);
+}
+
+TEST(JitKernel, lstm) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) {
+    int d4 = d * 4;
+    int d3 = d * 3;
+    std::vector<float> x(d4), xref(d4);
+    std::vector<float> ct_1(d), ct_tgt(d), ht_tgt(d);
+    std::vector<float> ct_ref(d), ht_ref(d);
+    RandomVec<float>(d4, x.data(), -2.f, 2.f);
+    RandomVec<float>(d, ct_1.data(), -2.f, 2.f);
+    memcpy(xref.data(), x.data(), sizeof(float) * d4);
+    std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
+    const auto& ker =
+        jit::KernelPool::Instance()
+            .template Get<jit::LSTMKernel<float>, const std::string&,
+                          const std::string&, const std::string&>(
+                act_gate, act_cand, act_cell, d, false);
+    // below kernels are used to compute refer
+    const auto& vsigmoid_3d =
+        jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(
+            d3);
+    const auto& vtanh_d =
+        jit::KernelPool::Instance().template Get<jit::VTanhKernel<float>>(d);
+    const auto& vexp_1 =
+        jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(1);
+    const auto& vmul_d =
+        jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(d);
+    const auto& vadd_d =
+        jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
+
+    float* x_data = x.data();
+    float* xref_data = xref.data();
+    const float* ct_1_data = ct_1.data();
+    float* ct_tgt_data = ct_tgt.data();
+    float* ht_tgt_data = ht_tgt.data();
+    float* ct_ref_data = ct_ref.data();
+    float* ht_ref_data = ht_ref.data();
+    // compute once to check correctness
+    lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data,
+                  ct_ref_data, ht_ref_data);
+    ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3);
+      EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3);
+    }
+
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      lstm_ctht_better(vsigmoid_3d, vtanh_d, vmul_d, vadd_d, d, xref_data,
+                       ct_1_data, ct_ref_data, ht_ref_data);
+    }
+    auto tmkle = GetCurrentUS();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data,
+                    ct_ref_data, ht_ref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, better(jit) takes: " << (tmkle - tmkls) / repeat
+            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+  }
+}
+
+void vscal_ref(const int n, const float a, const float* x, float* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a * x[i];
+  }
+}
+void vscal_inp_ref(const int n, const float a, float* x) {
+  for (int i = 0; i < n; ++i) {
+    x[i] = a * x[i];
+  }
+}
+#if defined __AVX__ || defined __AVX2__
+void vscal_intri8(const int n, const float a, const float* x, float* y) {
+  __m256 tmp;
+  __m256 scalar = _mm256_set1_ps(a);
+  tmp = _mm256_loadu_ps(x);
+  tmp = _mm256_mul_ps(tmp, scalar);
+  _mm256_storeu_ps(y, tmp);
+}
+void vscal_inp_intri8(const int n, const float a, float* x) {
+  __m256 tmp;
+  __m256 scalar = _mm256_set1_ps(a);
+  tmp = _mm256_loadu_ps(x);
+  tmp = _mm256_mul_ps(tmp, scalar);
+  _mm256_storeu_ps(x, tmp);
+}
+#endif
+
+#ifdef PADDLE_WITH_MKLML
+void vscal_inp_mkl(const int n, const float a, float* x) {
+  paddle::platform::dynload::cblas_sscal(n, a, x, 1);
+}
+#endif
+
+TEST(JitKernel, vscal) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
+    std::vector<float> x(d), y(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data());
+    std::memcpy(y.data(), x.data(), sizeof(float) * d);
+    float a = 2.f;
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VScalKernel<float>>(d);
+    const float* x_data = x.data();
+    float* y_data = y.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vscal_ref(d, a, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto trefs1 = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vscal_inp_ref(d, a, y_data);
+    }
+    auto trefe1 = GetCurrentUS();
+
+#ifdef PADDLE_WITH_MKLML
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vscal_inp_mkl(d, a, y_data);
+    }
+    auto tmkle = GetCurrentUS();
+#endif
+
+#if defined __AVX__ || defined __AVX2__
+    if (d == 8) {
+      auto si0 = GetCurrentUS();
+      for (int i = 0; i < repeat; ++i) {
+        vscal_intri8(d, a, x_data, zref_data);
+      }
+      auto si1 = GetCurrentUS();
+      auto si2 = GetCurrentUS();
+      for (int i = 0; i < repeat; ++i) {
+        vscal_inp_intri8(d, a, y_data);
+      }
+      auto si3 = GetCurrentUS();
+      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat
+              << " us, inplace: " << (si3 - si2) / repeat;
+    }
+#endif
+
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(a, x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+    auto ttgts1 = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(a, y_data);
+    }
+    auto ttgte1 = GetCurrentUS();
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, inplace takes: " << (trefe1 - trefs1) / repeat
+#ifdef PADDLE_WITH_MKLML
+            << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, "
+#else
+            << " us, "
+#endif
+            << "tgt takes: " << (ttgte - ttgts) / repeat
+            << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+void vmul_ref(const int n, const float* x, const float* y, float* z) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+}
+
+#if defined __AVX__ || defined __AVX2__
+void vmul_intri8(const int n, const float* x, const float* y, float* z) {
+  __m256 tmpx, tmpy;
+  tmpx = _mm256_loadu_ps(x);
+  tmpy = _mm256_loadu_ps(y);
+  tmpx = _mm256_mul_ps(tmpx, tmpy);
+  _mm256_storeu_ps(z, tmpx);
+}
+#endif
+
+#ifdef PADDLE_WITH_MKLML
+void vmul_mkl(const int n, const float* x, const float* y, float* z) {
+  paddle::platform::dynload::vsMul(n, x, y, z);
+}
+#endif
+
+TEST(JitKernel, vmul) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
+    std::vector<float> x(d), y(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data());
+    RandomVec<float>(d, y.data());
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(d);
+    const float* x_data = x.data();
+    const float* y_data = y.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vmul_ref(d, x_data, y_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+
+#ifdef PADDLE_WITH_MKLML
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vmul_mkl(d, x_data, y_data, zref_data);
+    }
+    auto tmkle = GetCurrentUS();
+#endif
+
+#if defined __AVX__ || defined __AVX2__
+    if (d == 8) {
+      auto si0 = GetCurrentUS();
+      for (int i = 0; i < repeat; ++i) {
+        vmul_intri8(d, x_data, y_data, zref_data);
+      }
+      auto si1 = GetCurrentUS();
+      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+    }
+#endif
+
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(x_data, y_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+#ifdef PADDLE_WITH_MKLML
+            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
+#else
+            << " us, "
+#endif
+            << "tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+void vadd_ref(const int n, const float* x, const float* y, float* z) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] + y[i];
+  }
+}
+
+#if defined __AVX__ || defined __AVX2__
+void vadd_intri8(const int n, const float* x, const float* y, float* z) {
+  __m256 tmpx, tmpy;
+  tmpx = _mm256_loadu_ps(x);
+  tmpy = _mm256_loadu_ps(y);
+  tmpx = _mm256_add_ps(tmpx, tmpy);
+  _mm256_storeu_ps(z, tmpx);
+}
+#endif
+
+#ifdef PADDLE_WITH_MKLML
+void vadd_mkl(const int n, const float* x, const float* y, float* z) {
+  paddle::platform::dynload::vsAdd(n, x, y, z);
+}
+#endif
+
+TEST(JitKernel, vadd) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
+    std::vector<float> x(d), y(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data());
+    RandomVec<float>(d, y.data());
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
+    const float* x_data = x.data();
+    const float* y_data = y.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vadd_ref(d, x_data, y_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+
+#ifdef PADDLE_WITH_MKLML
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vadd_mkl(d, x_data, y_data, zref_data);
+    }
+    auto tmkle = GetCurrentUS();
+#endif
+
+#if defined __AVX__ || defined __AVX2__
+    if (d == 8) {
+      auto si0 = GetCurrentUS();
+      for (int i = 0; i < repeat; ++i) {
+        vadd_intri8(d, x_data, y_data, zref_data);
+      }
+      auto si1 = GetCurrentUS();
+      VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat;
+    }
+#endif
+
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(x_data, y_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+#ifdef PADDLE_WITH_MKLML
+            << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, "
+#else
+            << " us, "
+#endif
+            << "tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+void vaddrelu_ref(const int n, const float* x, const float* y, float* z) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] + y[i];
+    z[i] = z[i] > 0 ? z[i] : 0;
+  }
+}
+void vaddrelu_better(
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
+    const float* x, const float* y, float* z) {
+  vadd->Compute(x, y, z);
+  vrelu->Compute(z, z);
+}
+
+TEST(JitKernel, vaddrelu) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 256, 512}) {
+    std::vector<float> x(d), y(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data());
+    RandomVec<float>(d, y.data());
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VAddReluKernel<float>>(d);
+    const auto& vadd =
+        jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
+    const auto& vrelu =
+        jit::KernelPool::Instance().template Get<jit::VReluKernel<float>>(d);
+    const float* x_data = x.data();
+    const float* y_data = y.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vadd_ref(d, x_data, y_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data);
+    }
+    auto tmkle = GetCurrentUS();
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(x_data, y_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
+            << "tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
+TEST(JitKernel, pool) {
+  namespace jit = paddle::operators::math::jitkernel;
+  const int frame_size = 4;
+  std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
+  const auto& plstm1 =
+      jit::KernelPool::Instance()
+          .template Get<jit::LSTMKernel<float>, const std::string&,
+                        const std::string&, const std::string&>(
+              act_gate, act_cand, act_cell, frame_size, false);
+  const auto& plstm2 =
+      jit::KernelPool::Instance()
+          .template Get<jit::LSTMKernel<float>, const std::string&,
+                        const std::string&, const std::string&>(
+              act_gate, act_cand, act_cell, frame_size, false);
+  const auto& peephole =
+      jit::KernelPool::Instance()
+          .template Get<jit::LSTMKernel<float>, const std::string&,
+                        const std::string&, const std::string&>(
+              act_gate, act_cand, act_cell, frame_size, true);
+  EXPECT_TRUE(plstm1 != peephole);
+
+  const auto& pvmul_f =
+      jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(4);
+  EXPECT_TRUE(std::dynamic_pointer_cast<const jit::Kernel>(plstm2) !=
+              std::dynamic_pointer_cast<const jit::Kernel>(pvmul_f));
+
+  const auto& pvmul_d =
+      jit::KernelPool::Instance().template Get<jit::VMulKernel<double>>(4);
+  EXPECT_TRUE(std::dynamic_pointer_cast<const jit::Kernel>(pvmul_f) !=
+              std::dynamic_pointer_cast<const jit::Kernel>(pvmul_d));
+
+  const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4");
+  EXPECT_EQ(pvmul_f, pvmul_from_key);
+  const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5");
+  EXPECT_TRUE(pvmul_from_key2 == nullptr);
+}
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 6810c24227ed074d4ba747399348aa12f55467a0..75946740375d74043960b68e94eb048b3bab4b79 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <set>
-#include <vector>
+#include <unordered_map>
 
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 namespace paddle {
@@ -228,8 +229,24 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
 // add or mul.
 namespace scatter {
 
-size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
-  return std::find(rows.begin(), rows.end(), value) - rows.begin();
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
+                   size_t data_len, const T* in, T* out) {
+  blas->AXPY(data_len, 1., in, out);
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
+                   size_t data_len, const T* in, T* out) {
+  for (int64_t i = 0; i < data_len; i++) {
+    out[i] += in[i];
+  }
 }
 
 template <typename T>
@@ -244,24 +261,52 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input,
                   framework::SelectedRows* output) {
+    std::vector<const framework::SelectedRows*> inputs;
+    inputs.push_back(&input);
+    (*this)(context, inputs, output);
+  }
+
+  void operator()(const platform::CPUDeviceContext& context,
+                  const std::vector<const framework::SelectedRows*>& inputs,
+                  framework::SelectedRows* output) {
+    if (inputs.size() == 0) {
+      VLOG(3) << "no input! return";
+      return;
+    }
+    const framework::SelectedRows* has_value_input = nullptr;
+    for (auto* in : inputs) {
+      if (in->rows().size() > 0) {
+        has_value_input = in;
+        break;
+      }
+    }
+    if (has_value_input == nullptr) {
+      VLOG(3) << "no input has value! just return" << std::endl;
+      return;
+    }
+    auto input_width = has_value_input->value().dims()[1];
+    auto input_height = has_value_input->height();
     framework::SelectedRows& out = *output;
-    auto input_rows = input.rows();
-    std::vector<int64_t> merge_rows;
-    merge_rows.reserve(input_rows.size());
-    std::unordered_map<int64_t, size_t> rows_pos_map;
-    rows_pos_map.reserve(input_rows.size());
-    size_t idx = 0u;
-    for (std::vector<int64_t>::iterator iter = input_rows.begin();
-         iter != input_rows.end(); ++iter) {
-      if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
-        rows_pos_map[*iter] = idx++;
-        merge_rows.emplace_back(*iter);
+    std::set<int64_t> merged_row_set;
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
       }
+      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
+                        "all input should have same "
+                        "dimension except for the first one");
+      PADDLE_ENFORCE_EQ(input_height, input->height(),
+                        "all input should have same height");
+      merged_row_set.insert(input->rows().begin(), input->rows().end());
+    }
+    std::vector<int64_t> merge_rows(merged_row_set.begin(),
+                                    merged_row_set.end());
+    std::unordered_map<int64_t, size_t> rows_to_id;
+    for (size_t i = 0; i < merge_rows.size(); ++i) {
+      rows_to_id[merge_rows[i]] = i;
     }
-
-    auto input_width = input.value().dims()[1];
     out.set_rows(merge_rows);
-    out.set_height(input.height());
+    out.set_height(input_height);
     out.mutable_value()->mutable_data<T>(
         framework::make_ddim(
             {static_cast<int64_t>(merge_rows.size()), input_width}),
@@ -271,12 +316,20 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
     constant_functor(context, out.mutable_value(), 0.0);
 
     auto* out_data = out.mutable_value()->data<T>();
-    auto* input_data = input.value().data<T>();
 
-    for (size_t i = 0; i < input_rows.size(); i++) {
-      size_t out_i = rows_pos_map[input_rows[i]];
-      for (int64_t j = 0; j < input_width; j++) {
-        out_data[out_i * input_width + j] += input_data[i * input_width + j];
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      auto* input_data = input->value().data<T>();
+      auto& input_rows = input->rows();
+
+      for (size_t i = 0; i < input_rows.size(); i++) {
+        size_t out_i = rows_to_id[input_rows[i]];
+        elementwise_add_to<platform::CPUDeviceContext, T>(
+            context, &blas, static_cast<size_t>(input_width),
+            &input_data[i * input_width], &out_data[out_i * input_width]);
       }
     }
   }
@@ -284,6 +337,8 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
 
 template struct MergeAdd<platform::CPUDeviceContext, int>;
 template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
+template struct MergeAdd<platform::CPUDeviceContext, float>;
+template struct MergeAdd<platform::CPUDeviceContext, double>;
 
 template <typename T>
 struct UpdateToTensor<platform::CPUDeviceContext, T> {
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index ba8eccf82042b679f69a32f9d053f05ac8fb9a99..10f39822b9c904ce236a1a2a3806d70693bd2e63 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -267,10 +267,15 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input,
                   framework::SelectedRows* output) {
-    framework::SelectedRows& out = *output;
     framework::Vector<int64_t> input_rows(input.rows());
+    if (input_rows.size() == 0) {
+      return;
+    }
+
+    framework::SelectedRows& out = *output;
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
-    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+    std::vector<int64_t> merge_rows_cpu(row_set.begin(), row_set.end());
+    framework::Vector<int64_t> merge_rows(merge_rows_cpu);
 
     auto input_width = input.value().dims()[1];
 
@@ -296,6 +301,73 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
         out.mutable_rows()->CUDAMutableData(context.GetPlace()),
         out.rows().size(), input_width);
   }
+
+  void operator()(const platform::CUDADeviceContext& context,
+                  const std::vector<const framework::SelectedRows*>& inputs,
+                  framework::SelectedRows* output) {
+    if (inputs.size() == 0) {
+      VLOG(3) << "no input! return";
+      return;
+    }
+    const framework::SelectedRows* has_value_input = nullptr;
+    for (auto* in : inputs) {
+      if (in->rows().size() > 0) {
+        has_value_input = in;
+        break;
+      }
+    }
+    if (has_value_input == nullptr) {
+      VLOG(3) << "no input has value! just return" << std::endl;
+      return;
+    }
+    auto input_width = has_value_input->value().dims()[1];
+    auto input_height = has_value_input->height();
+    framework::SelectedRows& out = *output;
+    std::set<int64_t> merged_row_set;
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
+                        "all input should have same "
+                        "dimension except for the first one");
+      PADDLE_ENFORCE_EQ(input_height, input->height(),
+                        "all input should have same height");
+      merged_row_set.insert(input->rows().begin(), input->rows().end());
+    }
+    std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
+                                        merged_row_set.end());
+    framework::Vector<int64_t> merge_rows(merge_rows_cpu);
+
+    out.set_rows(merge_rows);
+    out.set_height(input_height);
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+
+    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
+    constant_functor(context, out.mutable_value(), 0.0);
+
+    auto* out_data = out.mutable_value()->data<T>();
+
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      auto* input_data = input->value().data<T>();
+      auto& input_rows = input->rows();
+      dim3 grid1(input_rows.size(), 1);
+
+      MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
+          input_data, input_rows.CUDAData(context.GetPlace()), out_data,
+          out.mutable_rows()->CUDAMutableData(context.GetPlace()),
+          out.rows().size(), input_width);
+    }
+  }
 };
 
 template struct MergeAdd<platform::CUDADeviceContext, float>;
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index 3d99c9b3f2303c941a150f77516703b8103c6c25..521c53dd0d71707c13c4364c5ee59943a03d4a2d 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include <map>
 #include <vector>
 
 #include "paddle/fluid/framework/eigen.h"
@@ -82,108 +83,9 @@ struct MergeAdd {
   void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input,
                   framework::SelectedRows* output);
-};
-
-template <>
-struct MergeAdd<platform::CPUDeviceContext, float> {
-  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
-                                     const framework::SelectedRows& input) {
-    framework::SelectedRows out;
-    (*this)(context, input, &out);
-    return out;
-  }
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
-    framework::SelectedRows& out = *output;
-    auto input_rows = input.rows();
-    std::vector<int64_t> merge_rows;
-    merge_rows.reserve(input_rows.size());
-    std::unordered_map<int64_t, size_t> rows_pos_map;
-    rows_pos_map.reserve(input_rows.size());
-    size_t idx = 0u;
-    for (std::vector<int64_t>::iterator iter = input_rows.begin();
-         iter != input_rows.end(); ++iter) {
-      if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
-        rows_pos_map[*iter] = idx++;
-        merge_rows.emplace_back(*iter);
-      }
-    }
-
-    auto input_width = input.value().dims()[1];
-    out.set_rows(merge_rows);
-    out.set_height(input.height());
-    out.mutable_value()->mutable_data<float>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
-
-    math::SetConstant<platform::CPUDeviceContext, float> constant_functor;
-    constant_functor(context, out.mutable_value(), 0.0);
-
-    auto* out_data = out.mutable_value()->data<float>();
-    auto* input_data = input.value().data<float>();
-
-    auto blas = GetBlas<platform::CPUDeviceContext, float>(context);
-    for (size_t i = 0; i < input_rows.size(); i++) {
-      size_t out_i = rows_pos_map[input_rows[i]];
-      float* y = out_data + out_i * input_width;
-      const float* x = input_data + i * input_width;
-      blas.AXPY(input_width, 1., x, y);
-    }
-  }
-};
-
-template <>
-struct MergeAdd<platform::CPUDeviceContext, double> {
-  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
-                                     const framework::SelectedRows& input) {
-    framework::SelectedRows out;
-    (*this)(context, input, &out);
-    return out;
-  }
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
-    framework::SelectedRows& out = *output;
-    auto input_rows = input.rows();
-    std::vector<int64_t> merge_rows;
-    merge_rows.reserve(input_rows.size());
-    std::unordered_map<int64_t, size_t> rows_pos_map;
-    rows_pos_map.reserve(input_rows.size());
-    size_t idx = 0u;
-    for (std::vector<int64_t>::iterator iter = input_rows.begin();
-         iter != input_rows.end(); ++iter) {
-      if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
-        rows_pos_map[*iter] = idx++;
-        merge_rows.emplace_back(*iter);
-      }
-    }
-
-    auto input_width = input.value().dims()[1];
-    out.set_rows(merge_rows);
-    out.set_height(input.height());
-    out.mutable_value()->mutable_data<double>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
-
-    math::SetConstant<platform::CPUDeviceContext, double> constant_functor;
-    constant_functor(context, out.mutable_value(), 0.0);
-
-    auto* out_data = out.mutable_value()->data<double>();
-    auto* input_data = input.value().data<double>();
-
-    auto blas = GetBlas<platform::CPUDeviceContext, double>(context);
-    for (size_t i = 0; i < input_rows.size(); i++) {
-      size_t out_i = rows_pos_map[input_rows[i]];
-      double* y = out_data + out_i * input_width;
-      const double* x = input_data + i * input_width;
-      blas.AXPY(input_width, 1., x, y);
-    }
-  }
+  void operator()(const DeviceContext& context,
+                  const std::vector<const framework::SelectedRows*>& inputs,
+                  framework::SelectedRows* output);
 };
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc
index 835589356042b44c9fa5988aed726434fd66910a..f15b37a1e3f0ae9c7612c4f74470472393ff4ad6 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
@@ -302,6 +302,64 @@ TEST(selected_rows_functor, cpu_merge_add_int) {
   EXPECT_EQ(out_data[1 * row_numel], 2);
   EXPECT_EQ(out_data[2 * row_numel], 1);
 }
+
+TEST(selected_rows_functor, cpu_merge_add_multi) {
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CPUDeviceContext ctx(cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
+                                       float>
+      set_const;
+
+  int64_t height = 10;
+  int64_t row_numel = 8;
+
+  std::vector<int64_t> rows1{5, 2, 5, 3, 5};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      cpu_place);
+  set_const(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{2, 5, 3, 5, 3};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      cpu_place);
+  set_const(ctx, in2_value, 1.0);
+
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
+  output->set_height(height);
+  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
+                                             float>
+      merge_add_functor;
+
+  std::vector<const paddle::framework::SelectedRows*> inputs;
+  inputs.push_back(selected_rows1.get());
+  inputs.push_back(selected_rows2.get());
+  merge_add_functor(ctx, inputs, output.get());
+
+  EXPECT_EQ(output->height(), height);
+  EXPECT_EQ(output->value().dims(),
+            paddle::framework::make_ddim({3, row_numel}));
+
+  std::vector<int64_t> ret_rows{2, 3, 5};
+  EXPECT_EQ(output->rows(), ret_rows);
+
+  auto* out_data = output->value().data<float>();
+  for (size_t i = 0; i < ret_rows.size(); ++i) {
+    for (size_t j = 0; j < row_numel; ++j) {
+      EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
+    }
+  }
+}
+
 TEST(selected_rows_functor, cpu_sum_to) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CPUDeviceContext ctx(cpu_place);
@@ -318,6 +376,7 @@ TEST(selected_rows_functor, cpu_sum_to) {
       paddle::framework::make_ddim(
           {static_cast<int64_t>(rows1.size()), row_numel}),
       cpu_place);
+
   functor(ctx, in1_value, 1.0);
   std::vector<int64_t> rows2{0, 5, 7, 9};
   std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
@@ -327,6 +386,7 @@ TEST(selected_rows_functor, cpu_sum_to) {
       paddle::framework::make_ddim(
           {static_cast<int64_t>(rows2.size()), row_numel}),
       cpu_place);
+
   functor(ctx, in2_value, 2.0);
   std::unique_ptr<paddle::framework::SelectedRows> output{
       new paddle::framework::SelectedRows()};
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu
index 5fc50aba25d8e69480a17f0f80877b0d03e17276..17af3e3999ca688c584f636f4c00386f886f9bbf 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
@@ -241,3 +241,67 @@ TEST(selected_rows_functor, gpu_add_to) {
   // row9: 2.0 + 3.0
   EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0);
 }
+
+TEST(selected_rows_functor, gpu_merge_add) {
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext& ctx =
+      *reinterpret_cast<paddle::platform::CUDADeviceContext*>(
+          paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      set_const;
+
+  int64_t height = 10;
+  int64_t row_numel = 8;
+
+  std::vector<int64_t> rows1{5, 2, 5, 3, 5};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
+  set_const(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{2, 5, 3, 5, 3};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
+  set_const(ctx, in2_value, 1.0);
+
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
+  output->set_height(height);
+  paddle::operators::math::scatter::MergeAdd<
+      paddle::platform::CUDADeviceContext, float>
+      merge_add_functor;
+
+  std::vector<const paddle::framework::SelectedRows*> inputs;
+  inputs.push_back(selected_rows1.get());
+  inputs.push_back(selected_rows2.get());
+  merge_add_functor(ctx, inputs, output.get());
+
+  paddle::framework::Tensor output_cpu;
+  paddle::framework::TensorCopy(output->value(), cpu_place, ctx, &output_cpu);
+  ctx.Wait();
+
+  EXPECT_EQ(output->height(), height);
+  EXPECT_EQ(output->value().dims(),
+            paddle::framework::make_ddim({3, row_numel}));
+
+  std::vector<int64_t> ret_rows{2, 3, 5};
+  EXPECT_EQ(output->rows(), ret_rows);
+
+  auto* out_data = output_cpu.data<float>();
+  for (size_t i = 0; i < ret_rows.size(); ++i) {
+    for (size_t j = 0; j < row_numel; ++j) {
+      EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
+    }
+  }
+}
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index 235b5405fb7d016f4bd8c738f75b303522183116..6d491dbf1ed162ef07fda4c07e95cc57108486fd 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename T>
+template <typename T, bool is_test>
 class MaxSeqPoolFunctor {
  public:
   void operator()(const platform::CPUDeviceContext& context,
@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
     }
   }
 };
+// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
+// index buffer
+template <typename T>
+class MaxSeqPoolFunctor<T, true> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
 
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
+                  dim * sizeof(T));
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+          }
+        }
+      }
+    }
+  }
+};
 template <typename T>
 class MaxSeqPoolGradFunctor {
  public:
@@ -157,17 +191,47 @@ class FirstSeqPoolFunctor {
   }
 };
 
+template <typename T>
+class SumSeqPoolGradFunctor {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  framework::LoDTensor* in_grad) {
+    auto lod = in_grad->lod()[0];
+    int64_t out_w = out_grad.numel() / out_grad.dims()[0];
+    int64_t in_w = in_grad->numel() / in_grad->dims()[0];
+    PADDLE_ENFORCE(in_w == out_w);
+    const T* out_g_data = out_grad.data<T>();
+    T* in_g_data = in_grad->mutable_data<T>(context.GetPlace());
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      int64_t in_offset = lod[i] * in_w;
+      const T* out_pos = out_g_data + i * out_w;
+      T* in_pos = in_g_data + in_offset;
+      for (int r = 0; r != h; ++r) {
+        blas.VCOPY(in_w, out_pos, in_pos + r * in_w);
+      }
+    }
+  }
+};
+
 template <typename T>
 class SequencePoolFunctor<platform::CPUDeviceContext, T> {
  public:
   /* max pool has index output */
   void operator()(const platform::CPUDeviceContext& context,
                   const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                   framework::Tensor* index = nullptr) {
     if (pooltype == "MAX") {
-      math::MaxSeqPoolFunctor<T> max_pool;
-      max_pool(context, input, output, index);
+      if (is_test) {
+        math::MaxSeqPoolFunctor<T, true> max_pool;
+        max_pool(context, input, output, index);
+      } else {
+        math::MaxSeqPoolFunctor<T, false> max_pool;
+        max_pool(context, input, output, index);
+      }
       return;
     }
     if (pooltype == "LAST") {
@@ -175,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       last_pool(context, input, output);
       return;
     }
+
     if (pooltype == "FIRST") {
       math::FirstSeqPoolFunctor<T> first_pool;
       first_pool(context, input, output);
@@ -231,9 +296,15 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
       math::SetConstant<platform::CPUDeviceContext, T> functor;
       functor(context, in_grad, 0);
     }
+
+    if (pooltype == "SUM") {
+      math::SumSeqPoolGradFunctor<T> sum_pool_grad;
+      sum_pool_grad(context, out_grad, in_grad);
+      return;
+    }
+
     auto lod = in_grad->lod()[0];
     auto& place = *context.eigen_device();
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
       auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
                                    static_cast<int>(lod[i + 1]));
@@ -247,12 +318,6 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
 
       if (pooltype == "AVERAGE") {
         in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
-      } else if (pooltype == "SUM") {
-        const T* out_g_data = out_g_t.data<T>();
-        T* in_g_data = in_g_t.mutable_data<T>(context.GetPlace());
-        for (int r = 0; r != h; ++r) {
-          blas.VCOPY(w, out_g_data, in_g_data + r * w);
-        }
       } else if (pooltype == "SQRT") {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
index a92aef805a0434f2ebcbc62d4e5eaef0cfb21bfa..0015fafbc892912424dfa6dbd1778438d384ca19 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                   framework::Tensor* index = nullptr) {
     auto& lod = input.lod()[0];
     const size_t item_dim = output->numel() / output->dims()[0];
diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h
index 8dcbee65d0b63a137e5f422ec8667cc950641b4a..a1046ea2160d0ae9c2251612c97d3f2640b0aad1 100644
--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -28,7 +28,7 @@ class SequencePoolFunctor {
   /* max pool has index output */
   void operator()(const DeviceContext& context, const std::string pooltype,
                   const framework::LoDTensor& input, framework::Tensor* output,
-                  framework::Tensor* index = nullptr);
+                  bool is_test = false, framework::Tensor* index = nullptr);
 };
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2bc008dd34ffcfe93a00bd4a8cde61626d91e235
--- /dev/null
+++ b/paddle/fluid/operators/math/sequence_pooling_test.cc
@@ -0,0 +1,126 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/sequence_pooling.h"
+#include <gtest/gtest.h>
+#include <vector>
+
+template <typename DeviceContext, typename Place, typename T>
+void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
+  paddle::framework::LoDTensor cpu_out_grad;
+  paddle::framework::LoDTensor cpu_in_grad;
+  paddle::framework::LoDTensor out_grad;
+  paddle::framework::LoDTensor in_grad;
+  const size_t second_dim = 128u;
+
+  // construct out_grad's tensor in cpu
+  const size_t out_first_dim = lod[0].size() - 1;
+  auto out_dims = paddle::framework::make_ddim(
+      {static_cast<int64_t>(out_first_dim), static_cast<int64_t>(second_dim)});
+
+  cpu_out_grad.mutable_data<T>(out_dims, paddle::platform::CPUPlace());
+  for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) {
+    cpu_out_grad.data<T>()[i] = static_cast<T>(i);
+  }
+
+  // copy to dst out_grad
+  auto* place = new Place();
+  DeviceContext* context = new DeviceContext(*place);
+  if (paddle::platform::is_cpu_place(*place)) {
+    out_grad = cpu_out_grad;
+  } else {
+    TensorCopySync(cpu_out_grad, *place, &out_grad);
+  }
+
+  // construct in_grad
+  in_grad.set_lod(lod);
+  auto in_dims = paddle::framework::make_ddim(
+      {static_cast<int64_t>(lod[0].back()), static_cast<int64_t>(second_dim)});
+  in_grad.mutable_data<T>(in_dims, context->GetPlace());
+
+  // check tensor contruction result
+  PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
+  for (int64_t i = 1; i < out_grad.dims().size(); ++i) {
+    PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]);
+  }
+
+  // call functor
+  paddle::operators::math::SequencePoolGradFunctor<DeviceContext, T>()(
+      *context, "SUM", out_grad, &in_grad);
+
+  if (paddle::platform::is_cpu_place(*place)) {
+    cpu_in_grad = in_grad;
+  } else {
+    TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad);
+    cpu_in_grad.set_lod(in_grad.lod());
+  }
+
+  EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim);
+  EXPECT_EQ(in_grad.lod(), lod);
+
+  if (paddle::platform::is_cpu_place(*place)) {
+    for (int64_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) {
+      int64_t begin = in_grad.lod()[0][i];
+      int64_t end = in_grad.lod()[0][i + 1];
+      paddle::framework::Tensor tmp = in_grad.Slice(begin, end);
+      for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) {
+        for (int64_t m = 0; m != second_dim; ++m) {
+          EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
+                    out_grad.data<T>()[m + i * second_dim]);
+        }
+      }
+    }
+  } else {
+    for (int64_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) {
+      int64_t begin = cpu_in_grad.lod()[0][i];
+      int64_t end = cpu_in_grad.lod()[0][i + 1];
+      paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end);
+      for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) {
+        for (int64_t m = 0; m != second_dim; ++m) {
+          EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
+                    cpu_out_grad.data<T>()[m + i * second_dim]);
+        }
+      }
+    }
+  }
+
+  delete place;
+  delete context;
+}
+
+TEST(SequencePoolingGrad, CPU_SUM) {
+  paddle::framework::LoD lod1;
+  lod1.push_back(std::vector<size_t>{0, 10});
+  TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
+                         paddle::platform::CPUPlace, float>(lod1);
+
+  paddle::framework::LoD lod2;
+  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
+  TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
+                         paddle::platform::CPUPlace, float>(lod2);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(SequencePoolingGrad, CUDA_SUM) {
+  paddle::framework::LoD lod1;
+  lod1.push_back(std::vector<size_t>{0, 10});
+  TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
+                         paddle::platform::CUDAPlace, float>(lod1);
+
+  paddle::framework::LoD lod2;
+  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
+  TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
+                         paddle::platform::CUDAPlace, float>(lod2);
+}
+#endif
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 9e0bebd17c02a3ce010b77142757b8789cfbcdd9..19426b3c204095bd415cebcd87cff18468acd564 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -34,7 +34,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of mean op");
-    AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
+    AddOutput("Out", "(Tensor) The output of mean op");
     AddComment(R"DOC(
 Mean Operator calculates the mean of all elements in X.
 
diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc
index c6ec4ab047d5e91625e646fd26108d2e477cdce5..6e0e13698097ade36449f2e8ff6ab981a1b24311 100644
--- a/paddle/fluid/operators/merge_ids_op.cc
+++ b/paddle/fluid/operators/merge_ids_op.cc
@@ -20,13 +20,16 @@ namespace operators {
 class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
-    AddInput(
-        "X",
-        "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the "
-        "size of embedding table")
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}")
+        .AsDuplicable();
+    AddInput("Rows", "(LoDTensor) the input ids with shape{row_size, 1}, ")
+        .AsDuplicable();
+    AddInput("X",
+             "(LoDTensors) multi input tensor with shape{Rows, N}, N is the "
+             "size of embedding table")
+        .AsDuplicable();
+    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.")
         .AsDuplicable();
-    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.");
 
     AddComment(R"DOC(
 Merge multi LoDTensor's into one according to Ids's shard num.
@@ -79,15 +82,19 @@ class MergeIdsOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids.");
-    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out.");
+    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
+                   "MergeIdsOp must has multi input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("Rows"),
+                   "MergeIdsOp must has multi input Rows.");
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has multi input X.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "MergeIdsOp must has multi output Out.");
 
     auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    auto ids_dims = ctx->GetInputDim("Ids");
+    auto ids_dims = ctx->GetInputsDim("Ids");
     if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+      PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[0][1], 1);
     }
     auto x_var_type = ctx->GetInputsVarType("X");
     for (auto &var_type : x_var_type) {
diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h
index 83712a8519c6817151e1922c606c0fdd4682a2db..fef9e023d02f45e21ec409ad398ba7d9bdd36880 100644
--- a/paddle/fluid/operators/merge_ids_op.h
+++ b/paddle/fluid/operators/merge_ids_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <tuple>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -30,59 +32,70 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
     if (!platform::is_cpu_place(place)) {
       PADDLE_THROW("MergeIds do not support GPU kernel");
     }
-    VLOG(3) << "run in MergeIdsOpKernel";
 
-    const auto *ids_var = ctx.InputVar("Ids");
-    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
-                   "only support to merge Ids of LoDTensor");
+    const auto ids = ctx.MultiInput<framework::LoDTensor>("Ids");
+    const auto row_ids = ctx.MultiInput<framework::LoDTensor>("Rows");
+    const auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
 
-    const auto &ids_tensor = ids_var->Get<framework::LoDTensor>();
-    const auto &ids_dims = ids_tensor.dims();
-    const int64_t *ids = ids_tensor.data<int64_t>();
+    PADDLE_ENFORCE_EQ(row_ids.size(), x_tensors.size(),
+                      "the number of Rows and X should be the same");
+    PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
+                      "the number of Ids and Out should be the same");
 
-    auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+    int row_ids_size = 0;
+    int row_size = 0;
+    int embedding_size = 0;
 
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    for (int i = 0; i < x_tensors.size(); ++i) {
+      const auto *x_tensor = x_tensors[i];
+      const auto *row_id = row_ids[i];
 
-    int batch_size = 0;
-    int embedding_size = 0;
-    for (auto &input : x_tensors) {
-      if (framework::product(input->dims()) != 0) {
-        if (embedding_size == 0) {
-          embedding_size = input->dims()[1];
-        }
-        PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1],
-                          "embedding size of all input should be the same");
-        batch_size += input->dims()[0];
+      if (embedding_size == 0) {
+        embedding_size = x_tensor->dims()[1];
       }
+      PADDLE_ENFORCE_EQ(embedding_size, x_tensor->dims()[1],
+                        "embedding size of all input should be the same");
+      row_size += x_tensor->dims()[0];
+      row_ids_size += row_id->dims()[0];
     }
+
     PADDLE_ENFORCE_EQ(
-        batch_size, ids_dims[0],
-        "the batch size of ids and merged embedding value should be the same");
+        row_size, row_ids_size,
+        "the merged X dim[0] and merged Rows dim[0] should be the same");
+
+    std::unordered_map<int64_t, std::tuple<int64_t, int64_t>>
+        selected_rows_idx_map;
+    for (int i = 0; i < x_tensors.size(); ++i) {
+      const auto *row_id = row_ids[i];
+
+      for (int j = 0; j < row_id->numel(); ++j) {
+        int64_t key = row_id->data<int64_t>()[j];
+        std::tuple<int64_t, int64_t> val = std::make_tuple(i, j);
+        selected_rows_idx_map.insert(std::make_pair(key, val));
+      }
+    }
+    PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(),
+                      "the rows and tensor map size should be the same");
+
+    for (int i = 0; i < outs.size(); ++i) {
+      auto *out_ids = ids[i];
+      auto *out = outs[i];
 
-    const size_t shard_num = x_tensors.size();
+      out->set_lod(out_ids->lod());
 
-    if (shard_num == 1) {
-      VLOG(3) << "only one shard, we can copy the data directly";
-      TensorCopy(*x_tensors[0], place, out);
-    } else {
-      std::vector<int> in_indexs(shard_num, 0);
+      int nums = static_cast<int>(out_ids->dims()[0]);
       auto *out_data = out->mutable_data<T>(
-          framework::make_ddim({batch_size, embedding_size}), place);
-      // copy data from ins[shard_num] to out.
-      for (int i = 0; i < ids_dims[0]; ++i) {
-        int64_t id = ids[i];
-        size_t shard_id = static_cast<size_t>(id) % shard_num;
-        int index = in_indexs[shard_id];
-        memcpy(out_data + embedding_size * i,
-               x_tensors[shard_id]->data<T>() + index * embedding_size,
+          framework::make_ddim({nums, embedding_size}), place);
+      for (int j = 0; j < nums; ++j) {
+        int id = out_ids->data<int64_t>()[j];
+        auto row_tuple = selected_rows_idx_map[id];
+        int64_t row_idx = std::get<1>(row_tuple);
+        const auto *x_tensor = x_tensors[std::get<0>(row_tuple)];
+
+        memcpy(out_data + embedding_size * j,
+               x_tensor->data<T>() + row_idx * embedding_size,
                sizeof(T) * embedding_size);
-        in_indexs[shard_id] += 1;
-      }
-
-      for (size_t i = 0; i < shard_num; ++i) {
-        PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0],
-                          "after merge, all data in x_tensor should be used");
       }
     }
   }
diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
index c8079a99fb8c8e05144c5390794db7757e74f6ae..7f0b51580aa2591ac7338ad7c29ee4756d909925 100644
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -19,49 +19,25 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-class MomentumOp : public framework::OperatorWithKernel {
+class MomentumOpInferVarType : public framework::VarTypeInference {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(param) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(grad) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
-                   "Input(velocity) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of Momentum should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
-                   "Output(VelocityOut) of Momentum should not be null.");
-
-    auto param_dim = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dim, ctx->GetInputDim("Grad"),
-        "Param and Grad input of MomentumOp should have the same dimension.");
-    PADDLE_ENFORCE_EQ(
-        param_dim, ctx->GetInputDim("Velocity"),
-        "Param and Velocity of MomentumOp should have the same dimension.");
-    PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
-                      "Learning_rate should be a scalar");
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-    ctx->SetOutputDim("VelocityOut", param_dim);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto input_var = op_desc.Input("Param")[0];
+    for (auto& out_var : op_desc.Output("ParamOut")) {
+      if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
+          framework::proto::VarType::SELECTED_ROWS) {
+        block->FindRecursiveOrCreateVar(out_var).SetType(
+            framework::proto::VarType::SELECTED_ROWS);
+      } else if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
+                 framework::proto::VarType::LOD_TENSOR) {
+        block->FindRecursiveOrCreateVar(out_var).SetType(
+            framework::proto::VarType::LOD_TENSOR);
+      } else {
+        PADDLE_THROW(
+            "Only support LodTensor and SelectedRows, Unexpected Input Type.");
+      }
+    }
   }
 };
 
@@ -115,6 +91,9 @@ $$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker);
-REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel<float>,
-                       ops::MomentumOpKernel<double>);
+REGISTER_OPERATOR(momentum, ops::MomentumOp, ops::MomentumOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::MomentumOpInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    momentum, ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
index 5dc920c70979ad5c3d4fb3acc8d2dacaffe386c0..b68fec34d43f0dee834f1045f192d5c6089d9356 100644
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -15,76 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/momentum_op.h"
 
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void MomentumKernel(const T* p, const T* g, const T* v,
-                               const T* learning_rate, const T mu,
-                               const int64_t num, bool use_nesterov, T* p_out,
-                               T* v_out) {
-  T lr = learning_rate[0];
-  if (use_nesterov) {
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-         i += blockDim.x * gridDim.x) {
-      T g_val = g[i];
-      T v_new = v[i] * mu + g_val;
-      v_out[i] = v_new;
-      p_out[i] = p[i] - (g_val + v_new * mu) * lr;
-    }
-  } else {
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-         i += blockDim.x * gridDim.x) {
-      T v_new = v[i] * mu + g[i];
-      v_out[i] = v_new;
-      p_out[i] = p[i] - lr * v_new;
-    }
-  }
-}
-
-template <typename T>
-class MomentumOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(), param_var->Type().name());
-    const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Grad").front(), grad_var->Type().name());
-
-    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
-    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
-    auto param = ctx.Input<framework::Tensor>("Param");
-    auto velocity = ctx.Input<framework::Tensor>("Velocity");
-    auto grad = ctx.Input<framework::Tensor>("Grad");
-    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-
-    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
-    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
-
-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
-
-    auto* p = param->data<T>();
-    auto* v = velocity->data<T>();
-    auto* g = grad->data<T>();
-    auto* lr = learning_rate->data<T>();
-
-    int block = 512;
-    int grid = (param->numel() + block - 1) / block;
-    MomentumKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
-        p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel<float>,
-                        ops::MomentumOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h
index 40073d21b7186ad319e4988e667750c267581300..71f079e4d97f5259359ee6572f584894551452ca 100644
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -13,35 +13,96 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/algorithm.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T>
-class MomentumOpKernel : public framework::OpKernel<T> {
+using framework::Tensor;
+using framework::SelectedRows;
+struct NoNesterov;
+struct UseNesterov;
+
+class MomentumOp : public framework::OperatorWithKernel {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(), param_var->Type().name());
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
-    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
-    auto param = ctx.Input<framework::Tensor>("Param");
-    auto velocity = ctx.Input<framework::Tensor>("Velocity");
-    auto grad = ctx.Input<framework::Tensor>("Grad");
-    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(param) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(grad) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
+                   "Input(velocity) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of Momentum should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
 
-    param_out->mutable_data<T>(ctx.GetPlace());
-    velocity_out->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
+                   "Output(VelocityOut) of Momentum should not be null.");
 
-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+    auto param_dim = ctx->GetInputDim("Param");
+    if (ctx->GetInputsVarType("Grad")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(
+          param_dim, ctx->GetInputDim("Grad"),
+          "Param and Grad input of MomentumOp should have the same dimension.");
+      PADDLE_ENFORCE_EQ(
+          param_dim, ctx->GetInputDim("Velocity"),
+          "Param and Velocity of MomentumOp should have the same dimension.");
+    }
+    PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
+                      "Learning_rate should be a scalar");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("VelocityOut", param_dim);
+  }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class CPUDenseMomentumFunctor {
+ private:
+  const Tensor* param;
+  const Tensor* grad;
+  const Tensor* velocity;
+  const Tensor* learning_rate;
+  const T mu;
+  const T use_nesterov;
+  Tensor* param_out;
+  Tensor* velocity_out;
+
+ public:
+  CPUDenseMomentumFunctor(const Tensor* param, const Tensor* grad,
+                          const Tensor* velocity, const Tensor* learning_rate,
+                          const T mu, const bool use_nesterov,
+                          Tensor* param_out, Tensor* velocity_out)
+      : param(param),
+        grad(grad),
+        velocity(velocity),
+        learning_rate(learning_rate),
+        mu(mu),
+        use_nesterov(use_nesterov),
+        param_out(param_out),
+        velocity_out(velocity_out) {}
 
+  inline void operator()() {
     auto p_out = framework::EigenVector<T>::Flatten(*param_out);
     auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
 
@@ -59,5 +120,283 @@ class MomentumOpKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T, typename UpdateMethod>
+class DenseMomentumFunctor;
+
+// NOTE(dzh) for performance.
+// avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two
+// functor.
+template <typename T>
+class DenseMomentumFunctor<T, UseNesterov> {
+ private:
+  const T* p_;
+  const T* g_;
+  const T* v_;
+  const T* lr_;
+  const T mu_;
+  const int64_t num_;
+  T* p_out_;
+  T* v_out_;
+
+ public:
+  DenseMomentumFunctor(const T* p, const T* g, const T* v,
+                       const T* learning_rate, const T mu, const int64_t num,
+                       T* p_out, T* v_out)
+      : p_(p),
+        g_(g),
+        v_(v),
+        lr_(learning_rate),
+        mu_(mu),
+        num_(num),
+        p_out_(p_out),
+        v_out_(v_out) {}
+  inline HOSTDEVICE void operator()(size_t i) const {
+    // put memory access in register
+    const T p = p_[i];
+    const T g = g_[i];
+    const T lr = lr_[0];
+    const T v = v_[i];
+    T v_out = v * mu_ + g;
+    T p_out = p - (g + v_out * mu_) * lr;
+    // write reigster to memory
+    v_out_[i] = v_out;
+    p_out_[i] = p_out;
+  }
+};
+
+template <typename T>
+class DenseMomentumFunctor<T, NoNesterov> {
+ private:
+  const T* p_;
+  const T* g_;
+  const T* v_;
+  const T* lr_;
+  const T mu_;
+  const int64_t num_;
+  T* p_out_;
+  T* v_out_;
+
+ public:
+  DenseMomentumFunctor(const T* p, const T* g, const T* v,
+                       const T* learning_rate, const T mu, const int64_t num,
+                       T* p_out, T* v_out)
+      : p_(p),
+        g_(g),
+        v_(v),
+        lr_(learning_rate),
+        mu_(mu),
+        num_(num),
+        p_out_(p_out),
+        v_out_(v_out) {}
+  inline HOSTDEVICE void operator()(size_t i) const {
+    // put memory access in register
+    const T p = p_[i];
+    const T g = g_[i];
+    const T lr = lr_[0];
+    const T v = v_[i];
+    T v_out = v * mu_ + g;
+    T p_out = p - lr * v_out;
+    // write reigster to memory
+    v_out_[i] = v_out;
+    p_out_[i] = p_out;
+  }
+};
+
+template <typename T, typename UpdateMethod>
+class SparseMomentumFunctor;
+
+template <typename T>
+class SparseMomentumFunctor<T, UseNesterov> {
+ private:
+  const T* p_;
+  const T* g_;
+  const T* v_;
+  const T* lr_;
+  const T mu_;
+  const int64_t* rows_;
+  const int64_t row_numel_;
+  const int64_t row_height_;
+  T* p_out_;
+  T* v_out_;
+
+ public:
+  SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
+                        const T mu, const int64_t* rows, int64_t row_numel,
+                        int64_t row_height, T* p_out, T* v_out)
+      : p_(p),
+        g_(g),
+        v_(v),
+        lr_(lr),
+        mu_(mu),
+        rows_(rows),
+        row_numel_(row_numel),
+        row_height_(row_height),
+        p_out_(p_out),
+        v_out_(v_out) {}
+
+  inline HOSTDEVICE void operator()(size_t i) {
+    auto row_idx =
+        math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
+    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
+    // put memory access in register
+    const T p = p_[i];
+    const T lr = lr_[0];
+    const T v = v_[i];
+    T v_out = v * mu_ + g;
+    T p_out = p - (g + v_out * mu_) * lr;
+    // write reigster to memory
+    v_out_[i] = v_out;
+    p_out_[i] = p_out;
+  }
+};
+
+template <typename T>
+class SparseMomentumFunctor<T, NoNesterov> {
+ private:
+  const T* p_;
+  const T* g_;
+  const T* v_;
+  const T* lr_;
+  const T mu_;
+  const int64_t* rows_;
+  const int64_t row_numel_;
+  const int64_t row_height_;
+  T* p_out_;
+  T* v_out_;
+
+ public:
+  SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
+                        const T mu, const int64_t* rows, int64_t row_numel,
+                        int64_t row_height, T* p_out, T* v_out)
+      : p_(p),
+        g_(g),
+        v_(v),
+        lr_(lr),
+        mu_(mu),
+        rows_(rows),
+        row_numel_(row_numel),
+        row_height_(row_height),
+        p_out_(p_out),
+        v_out_(v_out) {}
+
+  inline HOSTDEVICE void operator()(size_t i) {
+    auto row_idx =
+        math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
+    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
+    // put memory access in register
+    const T p = p_[i];
+    const T lr = lr_[0];
+    const T v = v_[i];
+    T v_out = v * mu_ + g;
+    T p_out = p - v_out * lr;
+    // write reigster to memory
+    v_out_[i] = v_out;
+    p_out_[i] = p_out;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MomentumOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto* velocity = ctx.Input<framework::Tensor>("Velocity");
+    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+    param_out->mutable_data<T>(ctx.GetPlace());
+    velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    auto* grad_var = ctx.InputVar("Grad");
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      auto grad = ctx.Input<framework::Tensor>("Grad");
+      if (platform::is_cpu_place(ctx.GetPlace())) {
+        CPUDenseMomentumFunctor<T> functor(param, grad, velocity, learning_rate,
+                                           mu, use_nesterov, param_out,
+                                           velocity_out);
+        functor();
+      } else if (platform::is_gpu_place(ctx.GetPlace())) {
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(ctx.device_context()),
+            param->numel());
+        if (use_nesterov) {
+          DenseMomentumFunctor<T, UseNesterov> functor(
+              param->data<T>(), grad->data<T>(), velocity->data<T>(),
+              learning_rate->data<T>(), mu, param->numel(),
+              param_out->mutable_data<T>(ctx.GetPlace()),
+              velocity_out->mutable_data<T>(ctx.GetPlace()));
+          for_range(functor);
+
+        } else {
+          DenseMomentumFunctor<T, NoNesterov> functor(
+              param->data<T>(), grad->data<T>(), velocity->data<T>(),
+              learning_rate->data<T>(), mu, param->numel(),
+              param_out->mutable_data<T>(ctx.GetPlace()),
+              velocity_out->mutable_data<T>(ctx.GetPlace()));
+          for_range(functor);
+        }
+      }
+
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      // sparse update embedding with selectedrows
+      auto grad = ctx.Input<framework::SelectedRows>("Grad");
+
+      // sparse update maybe empty.
+      if (grad->rows().size() == 0) {
+        VLOG(3) << "Grad SelectedRows contains no data!";
+        return;
+      }
+      auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
+                              .Var()
+                              ->GetMutable<framework::SelectedRows>();
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      merge_func(ctx.template device_context<DeviceContext>(), *grad,
+                 merged_grad);
+
+      const int64_t* rows = nullptr;
+#ifdef PADDLE_WITH_CUDA
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
+      } else {
+#endif
+        rows = merged_grad->rows().data();
+#ifdef PADDLE_WITH_CUDA
+      }
+#endif
+      int64_t row_numel =
+          merged_grad->value().numel() / merged_grad->rows().size();
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          param->numel());
+      if (use_nesterov) {
+        SparseMomentumFunctor<T, UseNesterov> functor(
+            param->data<T>(), merged_grad->value().data<T>(),
+            velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
+            static_cast<int64_t>(merged_grad->rows().size()),
+            param_out->mutable_data<T>(ctx.GetPlace()),
+            velocity_out->mutable_data<T>(ctx.GetPlace()));
+        for_range(functor);
+
+      } else {
+        SparseMomentumFunctor<T, NoNesterov> functor(
+            param->data<T>(), merged_grad->value().data<T>(),
+            velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
+            static_cast<int64_t>(merged_grad->rows().size()),
+            param_out->mutable_data<T>(ctx.GetPlace()),
+            velocity_out->mutable_data<T>(ctx.GetPlace()));
+        for_range(functor);
+      }
+    } else {
+      PADDLE_THROW(
+          string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
+                          "gradient, but the received Variable Type is %s",
+                          grad_var->Type().name()));
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index f8ad63690e84339da0390d4ddd2db45f25db385a..24a5346b031008531fcefff0e6f1c31da33d1c3b 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -151,8 +151,7 @@ void Pool2dOpMaker::Make() {
             "The format of output tensor is also NCHW, "
             "where N is batch size, C is the number of channels, "
             "H is the height of the feature, "
-            "and W is the width of the feature.")
-      .Reuse("X");
+            "and W is the width of the feature.");
 
   AddAttr<std::string>("pooling_type",
                        "(string), pooling type, can be \"max\" for max-pooling "
@@ -252,8 +251,7 @@ void Pool3dOpMaker::Make() {
             "The format of output tensor is also NCDHW, "
             "where N is batch size, C is "
             "the number of channels, and D, H and W is the depth, height and "
-            "width of the feature, respectively.")
-      .Reuse("X");
+            "width of the feature, respectively.");
 
   AddAttr<std::string>("pooling_type",
                        "(string) Pooling type, can be \"max\" for max-pooling "
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index 28cc91a5ed5d74994e5b960a0a4dd3c6a5e6cdcc..51b980acb5a08d431d96a3a92479dec09119c27e 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -31,8 +31,8 @@ class BlockingQueue {
   // is a workaround and a simplified version of framework::Channel as it
   // doesn't support GPU and it implements on buffered blocking queue.
  public:
-  explicit BlockingQueue(size_t capacity)
-      : capacity_(capacity), closed_(false) {
+  explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
+      : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
     PADDLE_ENFORCE_GT(
         capacity_, 0,
         "The capacity of a reader::BlockingQueue must be greater than 0.");
@@ -72,7 +72,9 @@ class BlockingQueue {
     if (!queue_.empty()) {
       PADDLE_ENFORCE_NOT_NULL(elem);
       *elem = queue_.front();
-      queue_.pop_front();
+      if (LIKELY(!speed_test_mode_)) {
+        queue_.pop_front();
+      }
       send_cv_.notify_one();
       return true;
     } else {
@@ -114,6 +116,7 @@ class BlockingQueue {
 
  private:
   size_t capacity_;
+  bool speed_test_mode_;
   bool closed_;
   std::deque<T> queue_;
 
diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
index 4f7cfc24ec035349f3c85e84d876ad9b5b5493a6..3f041ff7e4e32b407729a22aab25d3aab199fee0 100644
--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue {
 
  private:
   LoDTensorBlockingQueue(size_t capacity,
-                         const std::vector<framework::DDim>& dims)
-      : queue_(capacity), dims_(dims) {}
+                         const std::vector<framework::DDim>& dims,
+                         bool speed_test_mode = false)
+      : queue_(capacity, speed_test_mode), dims_(dims) {}
 
  public:
   bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue {
 
 class LoDTensorBlockingQueueHolder {
  public:
-  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
+  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims,
+                bool speed_test_mode = false) {
     PADDLE_ENFORCE(
         queue_ == nullptr,
         "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
-    queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
+    queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode));
   }
 
   inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
index 7d1b381d56c8cdc1e79e594b18c1a1ed59ab5284..dc0940ac0b78d295b5088cb6ae26300da1dc883d 100644
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) {
   q.Receive(&b);
   EXPECT_EQ(a.val_, b.val_);
 }
+
+TEST(BlockingQueue, speed_test_mode) {
+  size_t queue_size = 10;
+  BlockingQueue<size_t> q1(queue_size, false);
+  for (size_t i = 0; i < queue_size; ++i) {
+    q1.Send(i);
+  }
+  size_t b;
+  for (size_t i = 0; i < queue_size; ++i) {
+    q1.Receive(&b);
+    EXPECT_EQ(b, i);
+  }
+  EXPECT_EQ(q1.Size(), 0UL);
+
+  BlockingQueue<size_t> q2(queue_size, true);
+  for (size_t i = 0; i < queue_size; ++i) {
+    q2.Send(i);
+  }
+  for (size_t i = 0; i < queue_size; ++i) {
+    q2.Receive(&b);
+    EXPECT_EQ(b, 0UL);
+  }
+  EXPECT_EQ(q2.Size(), queue_size);
+}
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index d72f85f2c44db2fa887732cfc05e1376a6a79e4a..500d86fec33830fc2cfb0412f1f2c7780d08eb02 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of
 [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
 
 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
-Attr(shape) still should be set correctly to gurantee shape inference in 
+Attr(shape) still should be set correctly to gurantee shape inference in
 compile-time.
 
 )DOC");
@@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp {
       : ReshapeOp(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    ReshapeOp::InferShape(ctx);
     PADDLE_ENFORCE(ctx->HasOutput("XShape"),
                    "Output(XShape) of ReshapeOp should not be null.");
     const auto &x_dims = ctx->GetInputDim("X");
@@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp {
     }
     ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
     ctx->ShareLoD("X", /*->*/ "XShape");
+
+    ReshapeOp::InferShape(ctx);
   }
 };
 
diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h
index a04d1bd2ca5128714d10155a9679d921519cfe07..797cd45fdcdbd5c3567d1676f37e148304ee6e2d 100644
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
@@ -13,72 +13,254 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <math.h>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/algorithm.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
+template <typename T>
+struct DenseRmspropGradFunctor {
+  inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}
+
+  HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; }
+
+  const T *grad_;
+};
+
+template <typename T>
+struct SparseRmspropGradFunctor {
+  inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows,
+                                  int64_t row_numel, int64_t row_count)
+      : grad_(grad),
+        rows_(rows),
+        row_numel_(row_numel),
+        row_count_(row_count) {}
+
+  HOSTDEVICE inline T operator()(int64_t idx) const {
+    auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_);
+    return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0;
+  }
+
+  const T *grad_;
+  const int64_t *rows_;
+  int64_t row_numel_;
+  int64_t row_count_;
+};
+
+template <typename T, typename GradFunctor>
+struct UncenteredRmspropFunctor {
+  UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho,
+                           T epsilon, T momentum,
+                           const GradFunctor &grad_functor)
+      : param_(param),
+        ms_(ms),
+        mom_(mom),
+        lr_(lr),
+        rho_(rho),
+        epsilon_(epsilon),
+        momentum_(momentum),
+        grad_functor_(grad_functor) {}
+
+  HOSTDEVICE inline void operator()(int64_t idx) const {
+    T g = grad_functor_(idx);
+    T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
+    T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_);
+    param_[idx] -= mom_out;
+    ms_[idx] = ms_out;
+    mom_[idx] = mom_out;
+  }
+
+  T *param_;
+  T *ms_;
+  T *mom_;
+  const T *lr_;
+  T rho_;
+  T epsilon_;
+  T momentum_;
+  GradFunctor grad_functor_;
+};
+
+template <typename T, typename GradFunctor>
+struct CenteredRmspropFunctor {
+  CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr,
+                         T rho, T epsilon, T momentum,
+                         const GradFunctor &grad_functor)
+      : param_(param),
+        ms_(ms),
+        mom_(mom),
+        mean_grad_(mean_grad),
+        lr_(lr),
+        rho_(rho),
+        epsilon_(epsilon),
+        momentum_(momentum),
+        grad_functor_(grad_functor) {}
+
+  HOSTDEVICE inline void operator()(int64_t idx) const {
+    T g = grad_functor_(idx);
+    T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g;
+    T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g;
+    T mom_out = momentum_ * mom_[idx] +
+                lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_);
+    param_[idx] -= mom_out;
+    ms_[idx] = ms_out;
+    mom_[idx] = mom_out;
+    mean_grad_[idx] = mg_out;
+  }
+
+  T *param_;
+  T *ms_;
+  T *mom_;
+  T *mean_grad_;
+  const T *lr_;
+  T rho_;
+  T epsilon_;
+  T momentum_;
+  GradFunctor grad_functor_;
+};
+
 template <typename DeviceContext, typename T>
 class RmspropOpKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.Inputs("Param").front(), param_var->Type().name());
-
-    auto* param_out = ctx.Output<Tensor>("ParamOut");
-    auto* moment_out = ctx.Output<Tensor>("MomentOut");
-    auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut");
-
-    auto grad = ctx.Input<Tensor>("Grad");
-
-    param_out->mutable_data<T>(ctx.GetPlace());
-    moment_out->mutable_data<T>(ctx.GetPlace());
-    mean_square_out->mutable_data<T>(ctx.GetPlace());
-
-    float epsilon = ctx.Attr<float>("epsilon");
-    float rho = ctx.Attr<float>("decay");
-    float momentum = ctx.Attr<float>("momentum");
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    using LoDTensor = framework::LoDTensor;
+    auto *grad_var = ctx.InputVar("Grad");
+    auto *param_out = ctx.Output<LoDTensor>("ParamOut");
+    auto *moment_out = ctx.Output<LoDTensor>("MomentOut");
+    auto *mean_square_out = ctx.Output<LoDTensor>("MeanSquareOut");
+
+    auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    auto rho = static_cast<T>(ctx.Attr<float>("decay"));
+    auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
     bool centered = ctx.Attr<bool>("centered");
 
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
-
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto mom_out = EigenVector<T>::Flatten(*moment_out);
-    auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
-
-    ms_out.device(place) = rho * ms + (1 - rho) * g * g;
-    if (centered) {
-      auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
-      auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
-      mean_grad_out->mutable_data<T>(ctx.GetPlace());
-      auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
-
-      mg_out.device(place) = rho * mg + (1 - rho) * g;
-      mom_out.device(place) = momentum * mom +
-                              lr.broadcast(grad_dsize) * g /
-                                  (ms_out - mg_out.square() + epsilon).sqrt();
+    auto &p_tensor = *ctx.Input<LoDTensor>("Param");
+    auto &ms_tensor = *ctx.Input<LoDTensor>("MeanSquare");
+    auto &lr_tensor = *ctx.Input<LoDTensor>("LearningRate");
+    auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
+
+    PADDLE_ENFORCE_EQ(&p_tensor, param_out,
+                      "Param and ParamOut must be the same Tensor");
+    PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
+                      "Moment and MomentOut must be the same Tensor");
+    PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
+                      "MeanSquare and MeanSquareOut must be the same Tensor");
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    size_t limit = static_cast<size_t>(ms_tensor.numel());
+
+    if (grad_var->IsType<LoDTensor>()) {
+      auto &grad_tensor = grad_var->Get<LoDTensor>();
+
+      if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value) {
+        auto &place =
+            *ctx.template device_context<DeviceContext>().eigen_device();
+        auto lr_value = lr_tensor.data<T>()[0];
+
+        auto p = EigenVector<T>::Flatten(p_tensor);
+        auto ms = EigenVector<T>::Flatten(ms_tensor);
+        auto g = EigenVector<T>::Flatten(grad_tensor);
+        auto mom = EigenVector<T>::Flatten(mom_tensor);
+
+        auto p_out = EigenVector<T>::Flatten(*param_out);
+        auto mom_out = EigenVector<T>::Flatten(*moment_out);
+        auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+
+        ms_out.device(place) = rho * ms + (1 - rho) * g * g;
+        if (centered) {
+          auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
+          auto mg = EigenVector<T>::Flatten(mg_tensor);
+          auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
+          PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
+                         "MeanGrad and MeanGradOut must be the same Tensor");
+          auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
+
+          mg_out.device(place) = rho * mg + (1 - rho) * g;
+          mom_out.device(place) =
+              momentum * mom +
+              lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
+        } else {
+          mom_out.device(place) =
+              momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
+        }
+        p_out.device(place) = p - mom_out;
+      } else {
+        DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
+        platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
+        if (centered) {
+          auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
+          auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
+          PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
+                         "MeanGrad and MeanGradOut must be the same Tensor");
+          for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
+              param_out->mutable_data<T>(ctx.GetPlace()),
+              mean_square_out->mutable_data<T>(ctx.GetPlace()),
+              moment_out->mutable_data<T>(ctx.GetPlace()),
+              mean_grad_out->mutable_data<T>(ctx.GetPlace()),
+              lr_tensor.data<T>(), rho, epsilon, momentum, grad_func));
+        } else {
+          for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
+              param_out->mutable_data<T>(ctx.GetPlace()),
+              mean_square_out->mutable_data<T>(ctx.GetPlace()),
+              moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
+              rho, epsilon, momentum, grad_func));
+        }
+      }
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto &grad = grad_var->Get<framework::SelectedRows>();
+      auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
+                              .Var()
+                              ->GetMutable<framework::SelectedRows>();
+
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      merge_func(dev_ctx, grad, merged_grad);
+
+      platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
+      const int64_t *rows;
+#ifdef PADDLE_WITH_CUDA
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
+      } else {
+#endif
+        rows = merged_grad->rows().data();
+#ifdef PADDLE_WITH_CUDA
+      }
+#endif
+      auto &merged_tensor = merged_grad->value();
+      int64_t row_count = merged_grad->rows().size();
+      int64_t row_numel = merged_tensor.numel() / row_count;
+      SparseRmspropGradFunctor<T> grad_func(merged_tensor.data<T>(), rows,
+                                            row_numel, row_count);
+
+      if (centered) {
+        auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
+        auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
+        PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
+                       "MeanGrad and MeanGradOut must be the same Tensor");
+        for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
+            param_out->mutable_data<T>(ctx.GetPlace()),
+            mean_square_out->mutable_data<T>(ctx.GetPlace()),
+            moment_out->mutable_data<T>(ctx.GetPlace()),
+            mean_grad_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
+            rho, epsilon, momentum, grad_func));
+      } else {
+        for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
+            param_out->mutable_data<T>(ctx.GetPlace()),
+            mean_square_out->mutable_data<T>(ctx.GetPlace()),
+            moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
+            rho, epsilon, momentum, grad_func));
+      }
     } else {
-      mom_out.device(place) =
-          momentum * mom +
-          lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
+      PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
     }
-    p_out.device(place) = p - mom_out;
   }
 };
 
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c57a34c3a745e8fc03ca57dce478ecf60058a9a9
--- /dev/null
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/roi_align_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+class ROIAlignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ROIAlignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of ROIAlignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ROIAlignOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+    auto rois_dims = ctx->GetInputDim("ROIs");
+
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW.");
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
+                   "given as [[x1, y1, x2, y2], …].");
+    PADDLE_ENFORCE(rois_dims[1] == 4,
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
+                   "given as [[x1, y1, x2, y2], …].");
+    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled output height must greater than 0");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled output width must greater than 0");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must greater than 0");
+
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] = input_dims[1];
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+
+    ctx->SetOutputDim("Out", out_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIAlignGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The GRAD@Out of ROIAlignGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The GRAD@X of ROIAlignGradOp should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor), "
+             "The input of ROIAlignOp. "
+             "The format of input tensor is NCHW. Where N is batch size, "
+             "C is the number of input channels, "
+             "H is the height of the feature, and "
+             "W is the width of the feature.");
+    AddInput("ROIs",
+             "(LoDTensor), "
+             "ROIs (Regions of Interest) to pool over. "
+             "should be a 2-D LoDTensor of shape (num_rois, 4)"
+             "given as [[x1, y1, x2, y2], …]. "
+             "(x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the bottom right coordinates.");
+    AddOutput("Out",
+              "(Tensor), "
+              "The output of ROIAlignOp is a 4-D tensor with shape "
+              "(num_rois, channels, pooled_h, pooled_w).");
+    AddAttr<float>("spatial_scale",
+                   "(float, default 1.0), "
+                   "Multiplicative spatial scale factor "
+                   "to translate ROI coords from their input scale "
+                   "to the scale used when pooling.")
+        .SetDefault(1.0);
+    AddAttr<int>("pooled_height",
+                 "(int, default 1), "
+                 "The pooled output height.")
+        .SetDefault(1);
+    AddAttr<int>("pooled_width",
+                 "(int, default 1), "
+                 "The pooled output width.")
+        .SetDefault(1);
+    AddAttr<int>("sampling_ratio",
+                 "(int,default -1),"
+                 "number of sampling points in the interpolation grid"
+                 "If <=0, then grid points are adaptive to roi_width "
+                 "and pooled_w, likewise for height")
+        .SetDefault(-1);
+    AddComment(R"DOC(
+**RoIAlign Operator**
+
+Region of interest align (also known as RoI align) is to perform
+bilinear interpolation on inputs of nonuniform sizes to obtain 
+fixed-size feature maps (e.g. 7*7)
+
+Dividing each region proposal into equal-sized sections with
+the pooled_width and pooled_height. Location remains the origin
+result.
+
+In each ROI bin, the value of the four regularly sampled locations 
+are computed directly through bilinear interpolation. The output is
+the mean of four locations.
+Thus avoid the misaligned problem.   
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp);
+REGISTER_OP_CPU_KERNEL(
+    roi_align,
+    ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    roi_align_grad,
+    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bcec6f3563df7f4e1e48554cc891d596f9e56024
--- /dev/null
+++ b/paddle/fluid/operators/roi_align_op.cu
@@ -0,0 +1,353 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/roi_align_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <class T>
+__device__ T BilinearInterpolate(const T* input_data, const int height,
+                                 const int width, T y, T x) {
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    return 0;
+  }
+  y = y <= 0 ? 0 : y;
+  x = x <= 0 ? 0 : x;
+  int y_low = static_cast<int>(y);
+  int x_low = static_cast<int>(x);
+  int y_high;
+  int x_high;
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = static_cast<T>(y_low);
+  } else {
+    y_high = y_low + 1;
+  }
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = static_cast<T>(x_low);
+  } else {
+    x_high = x_low + 1;
+  }
+  T ly = y - y_low, lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  T v1 = input_data[y_low * width + x_low];
+  T v2 = input_data[y_low * width + x_high];
+  T v3 = input_data[y_high * width + x_low];
+  T v4 = input_data[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <class T>
+__device__ void BilinearInterpolateGradient(const int height, const int width,
+                                            T y, T x, T* w1, T* w2, T* w3,
+                                            T* w4, int* x_low, int* x_high,
+                                            int* y_low, int* y_high) {
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    return;
+  }
+
+  y = y <= 0 ? 0 : y;
+  x = x <= 0 ? 0 : x;
+  *y_low = static_cast<int>(y);
+  *x_low = static_cast<int>(x);
+  if (*y_low >= height - 1) {
+    *y_high = *y_low = height - 1;
+    y = static_cast<T>(*y_low);
+  } else {
+    *y_high = *y_low + 1;
+  }
+  if (*x_low >= width - 1) {
+    *x_high = *x_low = width - 1;
+    x = static_cast<T>(*x_low);
+  } else {
+    *x_high = *x_low + 1;
+  }
+  T ly = y - *y_low, lx = x - *x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
+
+  return;
+}
+
+template <class T>
+__global__ void GPUROIAlignForward(
+    const int nthreads, const T* input_data, const T* input_rois,
+    const float spatial_scale, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int sampling_ratio, int* roi_batch_id_data, T* output_data) {
+  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+
+    const T* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = roi_batch_id_data[n];
+
+    T roi_xmin = offset_input_rois[0] * spatial_scale;
+    T roi_ymin = offset_input_rois[1] * spatial_scale;
+    T roi_xmax = offset_input_rois[2] * spatial_scale;
+    T roi_ymax = offset_input_rois[3] * spatial_scale;
+
+    T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
+    T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_input_data =
+        input_data + (roi_batch_ind * channels + c) * height * width;
+
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
+                             : ceil(roi_height / pooled_height);
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+    const T count = roi_bin_grid_h * roi_bin_grid_w;
+    T output_val = 0;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_ymin + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_xmin + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+        T val = BilinearInterpolate(offset_input_data, height, width, y, x);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+    output_data[i] = output_val;
+  }
+}
+
+template <typename T>
+__global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
+                                    const T* out_grad, const int num_rois,
+                                    const float spatial_scale,
+                                    const int channels, const int height,
+                                    const int width, const int pooled_height,
+                                    const int pooled_width,
+                                    const int sampling_ratio,
+                                    int* roi_batch_id_data, T* input_grad) {
+  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+    const T* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = roi_batch_id_data[n];
+
+    T roi_xmin = offset_input_rois[0] * spatial_scale;
+    T roi_ymin = offset_input_rois[1] * spatial_scale;
+    T roi_xmax = offset_input_rois[2] * spatial_scale;
+    T roi_ymax = offset_input_rois[3] * spatial_scale;
+
+    T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
+    T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_input_grad =
+        input_grad + (roi_batch_ind * channels + c) * height * width;
+
+    const T* offset_out_grad =
+        out_grad + (n * channels + c) * pooled_height * pooled_width;
+    const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw];
+
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
+                             : ceil(roi_height / pooled_height);
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    const T count = roi_bin_grid_h * roi_bin_grid_w;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_ymin + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_xmin + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+        T w1 = 0, w2 = 0, w3 = 0, w4 = 0;
+        int x_low = -1, x_high = -1, y_low = -1, y_high = -1;
+        BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4,
+                                    &x_low, &x_high, &y_low, &y_high);
+        T diff1 = out_grad_this_bin * w1 / count;
+        T diff2 = out_grad_this_bin * w2 / count;
+        T diff3 = out_grad_this_bin * w3 / count;
+        T diff4 = out_grad_this_bin * w4 / count;
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low,
+                                  diff1);
+          platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high,
+                                  diff2);
+          platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low,
+                                  diff3);
+          platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high,
+                                  diff4);
+        }
+      }
+    }
+  }
+}
+
+template <typename Place, typename T>
+class GPUROIAlignOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<LoDTensor>("ROIs");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+
+    int rois_num = rois->dims()[0];
+
+    if (rois_num == 0) return;
+
+    int output_size = out->numel();
+    int blocks = NumBlocks(output_size);
+    int threads = kNumCUDAThreads;
+
+    Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({rois_num});
+    int* roi_batch_id_data =
+        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch_size,
+        "The rois_batch_size and imgs batch_size must be the same.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+    Tensor roi_batch_id_list_gpu;
+    framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(),
+                              &roi_batch_id_list_gpu);
+    GPUROIAlignForward<
+        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
+        height, width, pooled_height, pooled_width, sampling_ratio,
+        roi_batch_id_list_gpu.data<int>(),
+        out->mutable_data<T>(ctx.GetPlace()));
+  }
+};
+
+template <typename Place, typename T>
+class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<LoDTensor>("ROIs");
+
+    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+
+    int rois_num = rois->dims()[0];
+    int channels = in->dims()[1];
+    int height = in->dims()[2];
+    int width = in->dims()[3];
+
+    if (!in_grad) {
+      return;
+    }
+    Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({rois_num});
+    int* roi_batch_id_data =
+        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+    Tensor roi_batch_id_list_gpu;
+    framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(),
+                              &roi_batch_id_list_gpu);
+
+    in_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<Place, T> set_zero;
+    set_zero(ctx.cuda_device_context(), in_grad, static_cast<T>(0));
+
+    int output_grad_size = out_grad->numel();
+    int blocks = NumBlocks(output_grad_size);
+    int threads = kNumCUDAThreads;
+
+    if (output_grad_size > 0) {
+      GPUROIAlignBackward<
+          T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+          output_grad_size, rois->data<T>(), out_grad->data<T>(), rois_num,
+          spatial_scale, channels, height, width, pooled_height, pooled_width,
+          sampling_ratio, roi_batch_id_list_gpu.data<int>(),
+          in_grad->mutable_data<T>(ctx.GetPlace()));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    roi_align,
+    ops::GPUROIAlignOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUROIAlignOpKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    roi_align_grad,
+    ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a18aee1b86283cbb48f0b804ccfc476d7cd78f3b
--- /dev/null
+++ b/paddle/fluid/operators/roi_align_op.h
@@ -0,0 +1,332 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <limits>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+static constexpr int kROISize = 4;
+
+template <class T>
+void PreCalcForBilinearInterpolate(
+    const platform::DeviceContext& ctx, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int iy_upper,
+    const int ix_upper, T roi_ymin, T roi_xmin, T bin_size_h, T bin_size_w,
+    int roi_bin_grid_h, int roi_bin_grid_w, Tensor* pre_pos, Tensor* pre_w) {
+  int pre_calc_index = 0;
+  int* pre_pos_data = pre_pos->mutable_data<int>(ctx.GetPlace());
+  T* pre_w_data = pre_w->mutable_data<T>(ctx.GetPlace());
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        // calculate y of sample points
+        T y = roi_ymin + ph * bin_size_h +
+              static_cast<T>(iy + .5f) * bin_size_h /
+                  static_cast<T>(roi_bin_grid_h);
+        // calculate x of samle points
+        for (int ix = 0; ix < ix_upper; ix++) {
+          T x = roi_xmin + pw * bin_size_w +
+                static_cast<T>(ix + .5f) * bin_size_w /
+                    static_cast<T>(roi_bin_grid_w);
+          // deal with elements out of map
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            for (int i = 0; i < kROISize; ++i) {
+              pre_pos_data[i + pre_calc_index * kROISize] = 0;
+              pre_w_data[i + pre_calc_index * kROISize] = 0;
+            }
+            pre_calc_index += 1;
+            continue;
+          }
+          y = y <= 0 ? 0 : y;
+          x = x <= 0 ? 0 : x;
+
+          int y_low = static_cast<int>(y);
+          int x_low = static_cast<int>(x);
+          int y_high;
+          int x_high;
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = static_cast<T>(y_low);
+          } else {
+            y_high = y_low + 1;
+          }
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = static_cast<T>(x_low);
+          } else {
+            x_high = x_low + 1;
+          }
+          T ly = y - y_low, lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          pre_pos_data[pre_calc_index * kROISize] = y_low * width + x_low;
+          pre_pos_data[pre_calc_index * kROISize + 1] = y_low * width + x_high;
+          pre_pos_data[pre_calc_index * kROISize + 2] = y_high * width + x_low;
+          pre_pos_data[pre_calc_index * kROISize + 3] = y_high * width + x_high;
+          pre_w_data[pre_calc_index * kROISize] = hy * hx;
+          pre_w_data[pre_calc_index * kROISize + 1] = hy * lx;
+          pre_w_data[pre_calc_index * kROISize + 2] = ly * hx;
+          pre_w_data[pre_calc_index * kROISize + 3] = ly * lx;
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+template <class T>
+void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
+                                   const T out_grad_this_bin, const T count,
+                                   T* batch_grad_data) {
+  int x_low, y_low, x_high, y_high;
+  T w1, w2, w3, w4;
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    w1 = w2 = w3 = w4 = 0;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+  y = y <= 0 ? 0 : y;
+  x = x <= 0 ? 0 : x;
+  y_low = static_cast<int>(y);
+  x_low = static_cast<int>(x);
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = static_cast<T>(y_low);
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = static_cast<T>(x_low);
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low, lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+  T diff1 = out_grad_this_bin * w1 / count;
+  T diff2 = out_grad_this_bin * w2 / count;
+  T diff3 = out_grad_this_bin * w3 / count;
+  T diff4 = out_grad_this_bin * w4 / count;
+  if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+    *(batch_grad_data + y_low * width + x_low) += diff1;
+    *(batch_grad_data + y_low * width + x_high) += diff2;
+    *(batch_grad_data + y_high * width + x_low) += diff3;
+    *(batch_grad_data + y_high * width + x_high) += diff4;
+  }
+}
+
+template <typename DeviceContext, typename T>
+class CPUROIAlignOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num = rois->dims()[0];
+
+    auto in_stride = framework::stride(in_dims);
+    auto roi_stride = framework::stride(rois->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    const T* input_data = in->data<T>();
+    framework::Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({rois_num});
+    int* roi_batch_id_data =
+        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
+
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch_size,
+        "The rois_batch_size and imgs batch_size must be the same.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    const T* rois_data = rois->data<T>();
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = roi_batch_id_data[n];
+      T roi_xmin = rois_data[0] * spatial_scale;
+      T roi_ymin = rois_data[1] * spatial_scale;
+      T roi_xmax = rois_data[2] * spatial_scale;
+      T roi_ymax = rois_data[3] * spatial_scale;
+
+      T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
+      T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
+      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+      const T* batch_data = input_data + roi_batch_id * in_stride[0];
+
+      int roi_bin_grid_h = (sampling_ratio > 0)
+                               ? sampling_ratio
+                               : ceil(roi_height / pooled_height);
+      int roi_bin_grid_w = (sampling_ratio > 0)
+                               ? sampling_ratio
+                               : ceil(roi_width / pooled_width);
+      const T count = roi_bin_grid_h * roi_bin_grid_w;
+      Tensor pre_pos;
+      Tensor pre_w;
+      int pre_size = count * out_stride[1];
+      pre_pos.Resize({pre_size, kROISize});
+      pre_w.Resize({pre_size, kROISize});
+
+      PreCalcForBilinearInterpolate(
+          dev_ctx, height, width, pooled_height, pooled_width, roi_bin_grid_h,
+          roi_bin_grid_w, roi_ymin, roi_xmin, bin_size_h, bin_size_w,
+          roi_bin_grid_h, roi_bin_grid_w, &pre_pos, &pre_w);
+      const int* pre_pos_data = pre_pos.data<int>();
+      const T* pre_w_data = pre_w.data<T>();
+      for (int c = 0; c < channels; c++) {
+        int pre_calc_index = 0;
+        for (int ph = 0; ph < pooled_height; ph++) {
+          for (int pw = 0; pw < pooled_width; pw++) {
+            const int pool_index = ph * pooled_width + pw;
+            T output_val = 0;
+            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                for (int i = 0; i < kROISize; i++) {
+                  int pos = pre_pos_data[pre_calc_index * kROISize + i];
+                  T w = pre_w_data[pre_calc_index * kROISize + i];
+                  output_val += w * batch_data[pos];
+                }
+                pre_calc_index += 1;
+              }
+            }
+            output_val /= count;
+            output_data[pool_index] = output_val;
+          }
+        }
+        batch_data += in_stride[1];
+        output_data += out_stride[1];
+      }
+      rois_data += roi_stride[0];
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+    auto in_dims = in->dims();
+    if (!in_grad) {
+      return;
+    }
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num = rois->dims()[0];
+    Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({rois_num});
+    int* roi_batch_id_data =
+        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
+
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+
+    const T* rois_data = rois->data<T>();
+    const T* out_grad_data = out_grad->data<T>();
+    T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
+
+    auto in_stride = framework::stride(in->dims());
+    auto roi_stride = framework::stride(rois->dims());
+    auto out_stride = framework::stride(out_grad->dims());
+
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_idx = roi_batch_id_data[n];
+      T roi_xmin = rois_data[0] * spatial_scale;
+      T roi_ymin = rois_data[1] * spatial_scale;
+      T roi_xmax = rois_data[2] * spatial_scale;
+      T roi_ymax = rois_data[3] * spatial_scale;
+      T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
+      T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
+      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+      for (int c = 0; c < channels; ++c) {
+        T* batch_grad_data =
+            in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
+        const T* batch_out_grad_data =
+            out_grad_data + n * out_stride[0] + c * out_stride[1];
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            int pool_index = ph * pooled_width + pw;
+            T out_grad_this_bin = batch_out_grad_data[pool_index];
+            int roi_bin_grid_h = (sampling_ratio > 0)
+                                     ? sampling_ratio
+                                     : ceil(roi_height / pooled_height);
+            int roi_bin_grid_w = (sampling_ratio > 0)
+                                     ? sampling_ratio
+                                     : ceil(roi_width / pooled_width);
+            T count = roi_bin_grid_h * roi_bin_grid_w;
+            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+              const T y = roi_ymin + ph * bin_size_h +
+                          static_cast<T>(iy + .5f) * bin_size_h /
+                              static_cast<T>(roi_bin_grid_h);
+              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                const T x = roi_xmin + pw * bin_size_w +
+                            static_cast<T>(ix + .5f) * bin_size_w /
+                                static_cast<T>(roi_bin_grid_w);
+                bilinear_interpolate_gradient(height, width, y, x,
+                                              out_grad_this_bin, count,
+                                              batch_grad_data);
+              }
+            }
+          }
+        }
+      }
+      rois_data += roi_stride[0];
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index d6d209d5de041500a9b4893d70800a58e8ee1e1d..8e29761ec208764e263e357a0b3c9456c932d093 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -174,4 +174,4 @@ REGISTER_OP_CPU_KERNEL(
 REGISTER_OP_CPU_KERNEL(
     roi_pool_grad,
     ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
index 46e20285db6d7acd39dead3994409645adddf494..75c3dd6bc498e35c6249f79a1c24cfe17316670e 100644
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -249,4 +249,4 @@ REGISTER_OP_CUDA_KERNEL(
 REGISTER_OP_CUDA_KERNEL(
     roi_pool_grad,
     ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc
index 397a3182953e3f1afaeadeff6d53a4f22fb95d26..3234b60861da3d0c6a8434eb11fd0488a95e171f 100644
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
@@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel,
                   paddle::framework::DefaultGradOpDescMaker<false>);
 template <typename T>
 using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>);
+REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
+                       Kernel<int64_t>);
+
 REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel,
                   op::SeqConcatGradShapeInferer);
 template <typename T>
 using GradKernel =
     op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
 REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
-                       GradKernel<double>);
+                       GradKernel<double>, GradKernel<int64_t>);
diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h
index 33e9babff274af888b84d33c991cc0a5b70333ae..ff035f421c4907ba940b973b3fd2a9421ed2dbae 100644
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
@@ -17,7 +17,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 
 namespace paddle {
 namespace operators {
@@ -106,7 +106,7 @@ class SeqConcatGradKernel : public framework::OpKernel<T> {
       }
     }
 
-    math::ConcatGradFunctor<DeviceContext, T> functor;
+    math::SplitFunctor<DeviceContext, T> functor;
     std::vector<const framework::Tensor *> sliced_x_ptr;
     std::vector<framework::Tensor *> sliced_dx_ptr;
     for (auto &x : sliced_x) {
diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc
index 15d3f064eb7b025dc9a85b2aabad24186061cbd4..217bb1610fd3f02f0f72d3b7750ebcdfad243f48 100644
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
               "(Tensor<int>) This tensor is used for the sequence max-pooling "
               "to record the max indexes.")
         .AsIntermediate();
+    AddAttr<bool>("is_test", "").SetDefault(false);
     AddAttr<std::string>(
         "pooltype",
         "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h
index 2aa20792f24305a106c500a3d7a6e3d363bc31d8..f2e4a55dee49664b2fc09813f6dba5f68aaf11d5 100644
--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<Tensor>("Out");
     std::string pooltype = context.Attr<std::string>("pooltype");
-    Tensor* index = nullptr;
-    if (pooltype == "MAX") {
-      index = context.Output<Tensor>("MaxIndex");
-    }
 
     auto dims = in->dims();
     auto lod = in->lod();
@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     dims[0] = lod[0].size() - 1;
     out->Resize({dims});
     out->mutable_data<T>(context.GetPlace());
-    if (pooltype == "MAX") {
+    Tensor* index = nullptr;
+
+    const bool is_test = context.Attr<bool>("is_test");
+
+    // Do not create index buffer for inference (is_test) mode
+    // TODO(jczaja): Skip index buffer creation for other devices eg. GPU
+    if (pooltype == "MAX" &&
+        (is_test == false ||
+         platform::is_cpu_place(context.GetPlace()) == false)) {
+      index = context.Output<Tensor>("MaxIndex");
       index->Resize({dims});
       index->mutable_data<int>(context.GetPlace());
     }
     math::SequencePoolFunctor<DeviceContext, T> pool;
     pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
-         index);
+         is_test, index);
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_reverse_op.cc b/paddle/fluid/operators/sequence_reverse_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1428cca1a6bf6150594f9cb72dbf00cd0eff7df5
--- /dev/null
+++ b/paddle/fluid/operators/sequence_reverse_op.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_reverse_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(sequence_reverse, ops::SequenceReverseOp,
+                  ops::SequenceReverseOpMaker,
+                  ops::SequenceReverseGradOpDescMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    sequence_reverse,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, uint8_t>,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_reverse_op.cu b/paddle/fluid/operators/sequence_reverse_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ce65f4799e8661adca60d212eaa9c3f0f92c4c29
--- /dev/null
+++ b/paddle/fluid/operators/sequence_reverse_op.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_reverse_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    sequence_reverse,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_reverse_op.h b/paddle/fluid/operators/sequence_reverse_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..39dad2311b2bcf29f808723caf7bfaef4c88cef2
--- /dev/null
+++ b/paddle/fluid/operators/sequence_reverse_op.h
@@ -0,0 +1,157 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/algorithm.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceReverseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
+
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GE(x_dim.size(), 2,
+                      "Rank of Input(X) must be not less than 2.");
+
+    ctx->SetOutputDim("Y", x_dim);
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+class SequenceReverseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input LoDTensor of sequence_reverse op.");
+    AddOutput("Y", "The output LoDTensor of sequence_reverse op.");
+    AddComment(R"DOC(
+SequenceReverse Operator.
+
+Reverse each sequence in input X along dim 0.
+
+Assuming X is a LoDTensor with dims [5, 4] and lod [[0, 2, 5]], where:
+
+X.data() = [
+  [1, 2, 3, 4],
+  [5, 6, 7, 8], # the 0-th sequence with length 2
+  [9, 10, 11, 12],
+  [13, 14, 15, 16],
+  [17, 18, 19, 20] # the 1-st sequence with length 3
+]
+
+The output Y would be a LoDTensor sharing the same dims and lod with input X,
+and:
+
+Y.data() = [
+  [5, 6, 7, 8],
+  [1, 2, 3, 4], # the reversed 0-th sequence with length 2
+  [17, 18, 19, 20],
+  [13, 14, 15, 16],
+  [9, 10, 11, 12] # the reversed 1-st sequence with length 3
+]
+
+This Operator is useful to build a reverse dynamic RNN network.
+
+This Operator only supports one-level lod currently.
+    )DOC");
+  }
+};
+
+template <typename T>
+struct SequenceReverseFunctor {
+  SequenceReverseFunctor(const T *x, T *y, const size_t *lod, size_t lod_count,
+                         size_t row_numel)
+      : x_(x), y_(y), lod_(lod), lod_count_(lod_count), row_numel_(row_numel) {}
+
+  HOSTDEVICE void operator()(size_t idx_x) const {
+    auto row_idx_x = idx_x / row_numel_;
+    auto lod_idx = math::UpperBound(lod_, lod_count_, row_idx_x);
+    auto row_idx_y = lod_[lod_idx - 1] + (lod_[lod_idx] - 1 - row_idx_x);
+    auto idx_y = row_idx_y * row_numel_ + idx_x % row_numel_;
+    y_[idx_y] = x_[idx_x];
+  }
+
+  const T *x_;
+  T *y_;
+  const size_t *lod_;
+  size_t lod_count_;
+  size_t row_numel_;
+};
+
+template <typename DeviceContext, typename T>
+class SequenceReverseOpKernel : public framework::OpKernel<T> {
+  using LoDTensor = framework::LoDTensor;
+
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &x = *ctx.Input<LoDTensor>("X");
+    auto *y = ctx.Output<LoDTensor>("Y");
+
+    PADDLE_ENFORCE_EQ(x.lod().size(), 1,
+                      "SequenceReverse Op only support one level lod.");
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    const size_t *lod;
+    size_t lod_count = x.lod()[0].size();
+
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      lod = x.lod()[0].CUDAData(ctx.GetPlace());
+    } else {
+#endif
+      lod = x.lod()[0].data();
+#ifdef PADDLE_WITH_CUDA
+    }
+#endif
+
+    size_t limit = static_cast<size_t>(x.numel());
+    size_t row_numel = static_cast<size_t>(limit / x.dims()[0]);
+    auto *x_data = x.data<T>();
+    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
+
+    PADDLE_ENFORCE_NE(x_data, y_data,
+                      "SequenceReverse Op does not support in-place operation");
+
+    SequenceReverseFunctor<T> functor(x_data, y_data, lod, lod_count,
+                                      row_numel);
+    platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
+    for_range(functor);
+  }
+};
+
+class SequenceReverseGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("sequence_reverse");
+    op->SetInput("X", OutputGrad("Y"));
+    op->SetOutput("Y", InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_unpad_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e633e378a226ece8adea2e150cc6c1e9aa874331
--- /dev/null
+++ b/paddle/fluid/operators/sequence_unpad_op.cc
@@ -0,0 +1,153 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sequence_unpad_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceUnpadOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceUnpadOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Length"),
+                   "Input(Length) of SequenceUnpadOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceUnpadOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "The rank of Input(X) can't be less than 2.");
+
+    auto len_dims = ctx->GetInputDim("Length");
+    PADDLE_ENFORCE(len_dims.size() == 2 && len_dims[1] == 1,
+                   "The shape of Input(Length) should be [batch_size, 1].");
+    PADDLE_ENFORCE(
+        len_dims[0] == x_dims[0],
+        "Input(X) and Input(Length) should have the same first dimension.");
+
+    int64_t out_dim_0 = -1;
+    if (ctx->IsRuntime()) {
+      out_dim_0 = x_dims[0] * x_dims[1];
+    }
+
+    std::vector<int64_t> out_dims_vec{out_dim_0};
+    if (x_dims.size() == 2) {
+      out_dims_vec.push_back(1);
+    } else {
+      for (int i = 2; i < x_dims.size(); ++i) {
+        out_dims_vec.push_back(x_dims[i]);
+      }
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) Input tensor which "
+             "contains the padded sequences with equal length.");
+    AddInput("Length",
+             "(LoDTensor) The input tensor which specifies the actual ength of "
+             "sequences after unpadding.");
+    AddOutput(
+        "Out",
+        "(LoDTensor) The output tensor which contains unpadded sequences.");
+    AddComment(R"DOC(
+      Sequence Unpad Operator
+
+      This operator removes the padding data in the input sequences and convert 
+      them into sequences with actual length as output, identitied by lod 
+      information.
+
+      Example:
+
+      Given input tensor Input(X):
+          X.data = [[ 1.0,  2.0,  3.0,  4.0,  5.0],
+                    [ 6.0,  7.0,  8.0,  9.0, 10.0],
+                    [11.0, 12.0, 13.0, 14.0, 15.0]], 
+`     
+      in which there are 3 sequences padded to length 5, and the acutal length 
+      specified by Input(Length):
+
+          Length.data = [[2], [3], [4]],
+
+      after unpadding, Output(Out) will be:
+
+          Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
+          Out.lod = [[0, 2, 5, 9]]      
+
+    )DOC");
+  }
+};
+
+class SequenceUnpadGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceUnpadGradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput(framework::GradVarName("Out")),
+        "Input(Out@GRAD) of SequenceUnpadGradOp should not be null.");
+
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+      ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X"));
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp,
+                  ops::SequenceUnpadOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_unpad,
+    ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceUnpadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_unpad_grad,
+    ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceUnpadGradOpKernel<paddle::platform::CPUDeviceContext,
+                                   int64_t>);
diff --git a/paddle/fluid/operators/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_unpad_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..75248372237ec2cb23122f6b16e64f6ce750ebf9
--- /dev/null
+++ b/paddle/fluid/operators/sequence_unpad_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sequence_unpad_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_unpad,
+    ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceUnpadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_unpad_grad,
+    ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceUnpadGradOpKernel<paddle::platform::CUDADeviceContext,
+                                   int64_t>);
diff --git a/paddle/fluid/operators/sequence_unpad_op.h b/paddle/fluid/operators/sequence_unpad_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..07df3dca831d7e646050ae57402c1a493c2e50e9
--- /dev/null
+++ b/paddle/fluid/operators/sequence_unpad_op.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/sequence_padding.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+template <typename DeviceContext, typename T>
+class SequenceUnpadOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x_t = ctx.Input<LoDTensor>("X");
+    auto* len_t = ctx.Input<LoDTensor>("Length");
+    auto* out_t = ctx.Output<LoDTensor>("Out");
+    out_t->mutable_data<T>(ctx.GetPlace());
+
+    const int64_t* seq_len_ptr = nullptr;
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      LoDTensor seq_len_cpu;
+      seq_len_cpu.Resize(len_t->dims());
+      seq_len_ptr = seq_len_cpu.mutable_data<int64_t>(platform::CPUPlace());
+      framework::TensorCopy(*len_t, platform::CPUPlace(),
+                            ctx.template device_context<DeviceContext>(),
+                            &seq_len_cpu);
+    } else {
+      seq_len_ptr = len_t->data<int64_t>();
+    }
+
+    size_t batch_size = x_t->dims()[0];
+    std::vector<size_t> out_lod0(batch_size + 1, 0);
+    for (size_t i = 0; i < batch_size; ++i) {
+      out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i];
+    }
+
+    framework::LoD out_lod;
+    out_lod.push_back(out_lod0);
+    out_t->set_lod(out_lod);
+
+    std::vector<int64_t> out_dims_vec{static_cast<int64_t>(out_lod0.back())};
+    if (x_t->dims().size() == 2) {
+      out_dims_vec.push_back(1);
+    } else {
+      for (int i = 2; i < x_t->dims().size(); ++i) {
+        out_dims_vec.push_back(x_t->dims()[i]);
+      }
+    }
+    out_t->Resize(framework::make_ddim(out_dims_vec));
+
+    int64_t padded_length = x_t->dims()[1];
+    math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *x_t, out_t,
+        padded_length, 0, false, math::kBatchLengthWidth);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceUnpadGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_x = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    if (d_x) {
+      const auto* d_out = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+      const auto* x_t = ctx.Input<LoDTensor>("X");
+      d_x->mutable_data<T>(ctx.GetPlace());
+
+      int padded_length = x_t->dims()[1];
+
+      LoDTensor zero_pads;
+      zero_pads.Resize({1, 1});
+      zero_pads.mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<DeviceContext, T> set_zero;
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      set_zero(dev_ctx, &zero_pads, static_cast<T>(0));
+
+      math::PaddingLoDTensorFunctor<DeviceContext, T>()(
+          ctx.template device_context<DeviceContext>(), *d_out, d_x, zero_pads,
+          padded_length, 0, false, math::kBatchLengthWidth);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index 411a126bc8e2b3a8d25f436489c13970568ccae4..ea62acd08c5009556abf05c91726111870d1a462 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -77,8 +77,7 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
     AddOutput("ParamOut",
               "(Tensor or SelectedRows, same with Param) "
-              "Output parameter, should share the same memory with Param")
-        .Reuse("Param");
+              "Output parameter, should share the same memory with Param");
     AddComment(R"DOC(
 
 SGD operator
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index 2bdb23e999621b10799b5163f326bc4b66a437e6..f6e241af0634650f4a32be6a4547617f8ec3ee60 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -76,6 +76,8 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
                    ops::SoftmaxCUDNNKernel<float>,
+                   ops::SoftmaxCUDNNKernel<double>,
                    ops::SoftmaxCUDNNKernel<plat::float16>);
 REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
-                   ops::SoftmaxGradCUDNNKernel<float>);
+                   ops::SoftmaxGradCUDNNKernel<float>,
+                   ops::SoftmaxGradCUDNNKernel<double>);
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index bb081238820b9ee3ae095442d21cfce11f7b41e5..a4bdbe6648afa7c91a056af4737bb5d826229022 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -80,8 +80,7 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X",
              "The input tensor of softmax, "
              "whose last dimension is the input_feature_dimensions.");
-    AddOutput("Out", "The normalized values with the same shape as X.")
-        .Reuse("X");
+    AddOutput("Out", "The normalized values with the same shape as X.");
     AddAttr<bool>(
         "use_cudnn",
         "(bool, default false) Only used in cudnn kernel, need install cudnn")
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 1a9324ec862fc3dd7ce669c5fed94527cac22b8f..2900221485e6ec097796ac38936ce31f8382c86a 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker
         "(bool, default: false), A flag to indicate whether to interpretate "
         "the given labels as soft labels.")
         .SetDefault(false);
+    AddAttr<bool>(
+        "numeric_stable_mode",
+        "(bool, default: false), A flag to indicate whether to use more "
+        "numerically stable algorithm. This flag is only valid when "
+        "soft_label is false and GPU is used.")
+        .SetDefault(false);
     AddAttr<int>(
         "ignore_index",
         "(int, default -100), Specifies a target value that is ignored and"
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index a07c17348ebb3f768d1c8be65c2d31e3c130bd23..6d48796191dd13a45f0c7267bfaf05489f528a9d 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cub/cub.cuh>
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
@@ -117,8 +118,8 @@ using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
 // Make sure that BlockDim <= feature_size
 // This kernel is used to calculate the max element of each row
 template <typename T, int BlockDim>
-__global__ void RowReductionForMax(const T* logits_data, T* max_data,
-                                   int feature_size) {
+static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
+                                          int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -141,9 +142,10 @@ __global__ void RowReductionForMax(const T* logits_data, T* max_data,
 }
 
 // Make sure that BlockDim <= feature_size
-template <typename T, int BlockDim>
-__global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
-                                          T* softmax, int feature_size) {
+template <typename T, int BlockDim, bool CalculateLogSoftmax = false>
+static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
+                                                 T* max_data, T* softmax,
+                                                 int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -153,24 +155,34 @@ __global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
 
   softmax[beg_idx] = logits_data[beg_idx] - block_max;
   T diff_max_sum = real_exp(softmax[beg_idx]);
-  beg_idx += BlockDim;
-  while (beg_idx < end_idx) {
-    softmax[beg_idx] = logits_data[beg_idx] - block_max;
-    diff_max_sum += real_exp(softmax[beg_idx]);
-    beg_idx += BlockDim;
+  auto idx = beg_idx + BlockDim;
+  while (idx < end_idx) {
+    softmax[idx] = logits_data[idx] - block_max;
+    diff_max_sum += real_exp(softmax[idx]);
+    idx += BlockDim;
   }
 
   diff_max_sum =
       BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
   if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
+
+  if (!CalculateLogSoftmax) return;
+  __syncthreads();
+  diff_max_sum = max_data[blockIdx.x];
+  softmax[beg_idx] -= diff_max_sum;
+  beg_idx += BlockDim;
+  while (beg_idx < end_idx) {
+    softmax[beg_idx] -= diff_max_sum;
+    beg_idx += BlockDim;
+  }
+  if (threadIdx.x == 0) max_data[blockIdx.x] = 0;
 }
 
 // Make sure that BlockDim <= feature_size
 template <typename T, int BlockDim>
-__global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
-                                                      const T* labels_data,
-                                                      T* loss_data, T* softmax,
-                                                      int feature_size) {
+static __global__ void RowReductionForSoftmaxAndCrossEntropy(
+    const T* logits_data, const T* labels_data, T* loss_data, T* softmax,
+    int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -194,11 +206,134 @@ __global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
 }
 
 template <typename T>
-__global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int batch_size) {
+struct HardLabelSoftmaxWithCrossEntropyFunctor {
+ public:
+  HardLabelSoftmaxWithCrossEntropyFunctor(const T* logits,
+                                          const int64_t* labels, T* loss,
+                                          T* log_softmax, int feature_size)
+      : logits_(logits),
+        labels_(labels),
+        loss_(loss),
+        log_softmax_(log_softmax),
+        feature_size_(feature_size) {}
+
+  __device__ void operator()(int idx) const {
+    auto row_idx = idx / feature_size_;
+    auto col_idx = idx % feature_size_;
+    if (col_idx != labels_[row_idx]) {
+      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+    } else {
+      auto softmax = log_softmax_[idx];
+      log_softmax_[idx] = real_exp(softmax);
+      loss_[row_idx] = -softmax;
+    }
+  }
+
+ private:
+  const T* logits_;
+  const int64_t* labels_;
+  T* loss_;
+  T* log_softmax_;
+  int feature_size_;
+};
+
+template <typename T>
+struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
+ public:
+  HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const T* logits,
+                                                       const int64_t* labels,
+                                                       T* loss, T* log_softmax,
+                                                       int feature_size,
+                                                       int ignore_idx)
+      : logits_(logits),
+        labels_(labels),
+        loss_(loss),
+        log_softmax_(log_softmax),
+        feature_size_(feature_size),
+        ignore_idx_(ignore_idx) {}
+
+  __device__ void operator()(int idx) const {
+    auto row_idx = idx / feature_size_;
+    auto col_idx = idx % feature_size_;
+    if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
+      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+    } else {
+      auto softmax = log_softmax_[idx];
+      log_softmax_[idx] = real_exp(softmax);
+      loss_[row_idx] = -softmax;
+    }
+  }
+
+ private:
+  const T* logits_;
+  const int64_t* labels_;
+  T* loss_;
+  T* log_softmax_;
+  int feature_size_;
+  int ignore_idx_;
+};
+
+template <typename T>
+static __global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out,
+                                                           int batch_size) {
   auto idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < batch_size) out[idx] = static_cast<T>(1);
 }
 
+template <typename T>
+static void HardLabelSoftmaxWithCrossEntropy(
+    const platform::CUDADeviceContext& ctx, const T* logits_data,
+    const int64_t* labels_data, T* loss_data, T* softmax_data, int batch_size,
+    int feature_size, int ignore_idx) {
+  constexpr int kMaxBlockDim = 512;
+  int block_dim = feature_size >= kMaxBlockDim
+                      ? kMaxBlockDim
+                      : (1 << static_cast<int>(std::log2(feature_size)));
+  auto stream = ctx.stream();
+
+#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim)    \
+  case BlockDim: {                                                           \
+    RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>(    \
+        logits_data, loss_data, feature_size);                               \
+    RowReductionForDiffMaxSum<T, BlockDim,                                   \
+                              true><<<batch_size, BlockDim, 0, stream>>>(    \
+        logits_data, loss_data, softmax_data, feature_size);                 \
+    platform::ForRange<platform::CUDADeviceContext> for_range(               \
+        ctx, batch_size* feature_size);                                      \
+    if (ignore_idx >= 0 && ignore_idx < feature_size) {                      \
+      for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx<T>(     \
+          logits_data, labels_data, loss_data, softmax_data, feature_size,   \
+          ignore_idx));                                                      \
+    } else {                                                                 \
+      for_range(HardLabelSoftmaxWithCrossEntropyFunctor<T>(                  \
+          logits_data, labels_data, loss_data, softmax_data, feature_size)); \
+    }                                                                        \
+  } break
+
+  switch (block_dim) {
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
+    case 1:
+      SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) /
+                                                kMaxBlockDim,
+                                            kMaxBlockDim, 0, stream>>>(
+          softmax_data, batch_size);
+      cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
+      break;
+    default:
+      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
+      break;
+  }
+#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
+}
+
 template <typename T>
 static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
                                                const T* labels_data,
@@ -237,7 +372,7 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
                                                 kMaxBlockDim,
                                             kMaxBlockDim, 0, stream>>>(
           softmax_data, batch_size);
-      cudaMemsetAsync(loss_data, 0, batch_size, stream);
+      cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
       break;
     default:
       PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
@@ -272,11 +407,21 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
           logits_data, labels_data, softmax_data, loss_data, batch_size,
           feature_size, context.cuda_device_context().stream());
     } else {
-      math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
-                                     softmax);
-      math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-          context.cuda_device_context(), loss, softmax, labels, false,
-          ignore_index);
+      if (!context.Attr<bool>("numeric_stable_mode")) {
+        math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
+                                       softmax);
+        math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+            context.cuda_device_context(), loss, softmax, labels, false,
+            ignore_index);
+      } else {
+        int batch_size = logits->dims()[0];
+        int feature_size = logits->dims()[1];
+        auto* logits_data = logits->data<T>();
+        auto* labels_data = labels->data<int64_t>();
+        HardLabelSoftmaxWithCrossEntropy<T>(
+            context.cuda_device_context(), logits_data, labels_data, loss_data,
+            softmax_data, batch_size, feature_size, ignore_index);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc
index c867c46873ae7ddbdbda280351e4ab28235bcc08..243f81e296fb95a2c7e9f717950b8a958ad98852 100644
--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
@@ -20,20 +20,27 @@ namespace operators {
 class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
-    AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}")
+        .AsDuplicable();
+
+    AddOutput("Out", "(LoDTensors) The outputs of the input Ids.")
         .AsDuplicable();
 
     AddComment(R"DOC(
 Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number
 Example:
   Input:
-    X = [1,2,3,4,5,6]
+    X = [[1,2,3,4,5,6],[2,3]]
 
   Out(3 output):
-    out0 = [3, 6]
-    out1 = [1, 4]
-    out2 = [2, 5]
+    if compress is True:
+        out0 = [3, 3, 6]
+        out1 = [1, 4]
+        out2 = [2, 2, 5]
+    else:
+        out0 = [3, 6]
+        out1 = [1, 4]
+        out2 = [2, 5]
 )DOC");
   }
 };
@@ -43,16 +50,24 @@ class SplitIdsOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("Ids"), "SplitIdsOp must has input Ids.");
     PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
 
     auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    auto ids_dims = ctx->GetInputDim("Ids");
+    auto ids_dims = ctx->GetInputsDim("Ids");
     if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+      PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2);
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.MultiInput<framework::Tensor>("Ids").front()->type()),
+        ctx.GetPlace());
+  }
 };
 
 class SplitIdsOpInferVarType : public framework::VarTypeInference {
@@ -66,12 +81,28 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
   }
 };
 
+class SplitIdsOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto grad = new framework::OpDesc();
+    grad->SetType("concat");
+    grad->SetInput("X", OutputGrad("Out"));
+    grad->SetOutput("Out", InputGrad("Ids"));
+    grad->SetAttr("axis", 0);
+    return std::unique_ptr<framework::OpDesc>(grad);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
-                  ops::SplitIdsOpInferVarType);
+                  ops::SplitIdsOpGradMaker, ops::SplitIdsOpInferVarType);
+
 REGISTER_OP_CPU_KERNEL(
     split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
     ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index c4af5a65fc5f81c1af7c1fdcca637ca37c940637..69ac6c5a6b9a8b318520eb9a3ff89a3a6be48339 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <iterator>
+#include <set>
 #include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
@@ -31,19 +33,39 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       PADDLE_THROW("SplitIds do not support GPU kernel");
     }
 
-    const auto *ids_var = ctx.InputVar("Ids");
+    const auto ids_vars = ctx.MultiInputVar("Ids");
+
+    PADDLE_ENFORCE_GT(ids_vars.size(), 0, "The number of Ids should > 0");
+    auto *ids_var = ids_vars[0];
+
     if (ids_var->IsType<framework::LoDTensor>()) {
-      const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
-      const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+      int batch_size = 0;
+      const auto ids_tensors = ctx.MultiInput<framework::LoDTensor>("Ids");
+      for (size_t i = 0; i < ids_tensors.size(); ++i) {
+        batch_size += ids_tensors[i]->dims()[0];
+      }
+      VLOG(4) << "Get Total BatchSize is: " << batch_size;
+
+      std::vector<T> all_ids(batch_size);
+      int offset = 0;
+      for (size_t i = 0; i < ids_tensors.size(); ++i) {
+        const auto *ids = ids_tensors[i];
+        std::memcpy(all_ids.data() + offset, ids->data<T>(),
+                    ids->numel() * sizeof(T));
+        offset += ids->numel();
+      }
+
+      std::set<T> st(all_ids.begin(), all_ids.end());
+      all_ids.assign(st.begin(), st.end());
+
       auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
       const size_t shard_num = outs.size();
-
       std::vector<std::vector<T>> out_ids;
       out_ids.resize(outs.size());
 
       // split id by their shard_num.
-      for (int i = 0; i < ids_dims[0]; ++i) {
-        T id = ids[i];
+      for (int i = 0; i < all_ids.size(); ++i) {
+        T id = all_ids[i];
         size_t shard_id = static_cast<size_t>(id) % shard_num;
         out_ids[shard_id].push_back(id);
       }
@@ -64,7 +86,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(ids_dims[0],
                         static_cast<int64_t>(ids_selected_rows->rows().size()),
                         "");
-      const T *ids = ids_selected_rows->value().data<T>();
+      const T *ids_data = ids_selected_rows->value().data<T>();
       const auto &ids_rows = ids_selected_rows->rows();
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
       const size_t shard_num = outs.size();
@@ -87,7 +109,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
         T *output = out->mutable_value()->mutable_data<T>(ddim, place);
         for (int64_t i = 0; i < ddim[0]; ++i) {
           memcpy(output + i * row_width,
-                 ids + id_to_index[out->rows()[i]] * row_width,
+                 ids_data + id_to_index[out->rows()[i]] * row_width,
                  row_width * sizeof(T));
         }
       }
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index d661b276bc31bf0c3ab181d706ffdccec89f0632..a05582ae09e16ee17194d299d713d321f28ccace 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -111,11 +111,10 @@ Example:
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-USE_CPU_ONLY_OP(concat);
 
 REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
-REGISTER_OP_CPU_KERNEL(split,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, double>,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, int64_t>,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, int>);
+REGISTER_OP_CPU_KERNEL(
+    split, ops::SplitOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index f0c417c70521b1bb3816f884d6ab7393473999e4..6f4a25ab5ed86937f2f5db532a9eba22b5a2c5be 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <chrono>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
@@ -28,18 +29,22 @@ class SplitOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    auto in_stride = framework::stride_numel(in->dims());
-    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    int axis = ctx.Attr<int>("axis");
     auto place = ctx.GetPlace();
 
-    size_t input_offset = 0;
-    for (auto& out : outs) {
-      out->mutable_data<T>(ctx.GetPlace());
-      auto out_stride = framework::stride_numel(out->dims());
-      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-                                  out_stride, in->data<T>() + input_offset,
-                                  in_stride, out_stride[axis]);
-      input_offset += out_stride[axis];
+    std::vector<const framework::Tensor*> shape_refer;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      outs[j]->mutable_data<T>(ctx.GetPlace());
+      shape_refer.emplace_back(outs[j]);
+    }
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    // Sometimes direct copies will be faster, this maybe need deeply analysis.
+    if (axis == 0 && outs.size() < 10) {
+      StridedMemcpyWithAxis0<T>(dev_ctx, *in, shape_refer, &outs);
+    } else {
+      math::SplitFunctor<DeviceContext, T> functor;
+      functor(dev_ctx, *in, shape_refer, axis, &outs);
     }
   }
 };
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
index 76615a9405d7a8e3fa9dba8d01a956209e02ae8f..0e7b1463d1ba81aed53e0e3f3a90d2a1fbf0ffbc 100644
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -22,9 +22,9 @@ class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "The input SelectedRows.");
     AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable();
-    AddAttr<std::vector<int>>("height_sections",
-                              "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int>({}));
+    AddAttr<std::vector<int64_t>>("height_sections",
+                                  "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int64_t>({}));
 
     AddComment(R"DOC(
 Split a SelectedRows with a specified rows section.
diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h
index 0e9ce165b98845f4745ee70b028513ea31cc6657..af64607fafc6544047714e731846a2440be219b8 100644
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
+static int FindOutIdx(int row, const std::vector<int64_t>& abs_sections) {
   for (size_t i = 1; i < abs_sections.size(); ++i) {
     if (row < abs_sections[i]) {
       return i - 1;
@@ -30,9 +30,9 @@ static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
   return abs_sections.size() - 1;
 }
 
-static std::vector<int> ToAbsoluteSection(
-    const std::vector<int>& height_sections) {
-  std::vector<int> abs_sections;
+static std::vector<int64_t> ToAbsoluteSection(
+    const std::vector<int64_t>& height_sections) {
+  std::vector<int64_t> abs_sections;
   abs_sections.resize(height_sections.size());
   abs_sections[0] = 0;
   for (size_t i = 1; i < height_sections.size(); ++i) {
@@ -47,7 +47,7 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* x = ctx.Input<framework::SelectedRows>("X");
     auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
-    auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
+    auto height_sections = ctx.Attr<std::vector<int64_t>>("height_sections");
 
     auto abs_sections = ToAbsoluteSection(height_sections);
 
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 7a10218e1556698f3e0a1828db5de8851dd1c90b..c3d83a06f23a34ec8cf27d585863135ebfd56a4f 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/detail/strided_memcpy.h"
-
 namespace paddle {
 namespace operators {
 
@@ -98,5 +99,26 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
   }
 }
 
+template <typename T>
+inline void StridedMemcpyWithAxis0(
+    const platform::DeviceContext& dev_ctx, const framework::Tensor& input,
+    const std::vector<const framework::Tensor*>& shape_refer,
+    std::vector<framework::Tensor*>* outputs) {
+  const framework::DDim in_stride = stride_numel(input.dims());
+  const int axis = 0;
+  size_t input_offset = 0;
+
+  for (size_t i = 0; i < outputs->size(); ++i) {
+    auto out_stride = stride_numel(shape_refer[i]->dims());
+    auto out = outputs->at(i);
+    if (out != nullptr) {
+      StridedNumelCopyWithAxis<T>(dev_ctx, axis, out->data<T>(), out_stride,
+                                  input.data<T>() + input_offset, in_stride,
+                                  out_stride[axis]);
+    }
+    input_offset += out_stride[axis];
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index fe7c7039c7dec714e265ede1b7167fd800ddc2f7..d19ac9839c90a116265b761e3b1b3f855e2d95e8 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto x_vars = ctx.MultiInputVar("X");
+    auto x_vars_name = ctx.Inputs("X");
 
     framework::LibraryType library{framework::LibraryType::kPlain};
     framework::DataLayout layout{framework::DataLayout::kAnyLayout};
@@ -81,15 +82,18 @@ class SumOp : public framework::OperatorWithKernel {
 
     if (x_vars[0]->IsType<framework::LoDTensor>()) {
       int dtype = -1;
-      for (auto& x_var : x_vars) {
-        auto& lod_tensor = x_var->Get<framework::LoDTensor>();
-        if (lod_tensor.numel() == 0) {
+      for (size_t idx = 0; idx < x_vars.size(); ++idx) {
+        PADDLE_ENFORCE(x_vars[idx] != nullptr,
+                       "Input var[%s] should not be nullptr", x_vars_name[idx]);
+        // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
+        auto tensor = framework::GetTensorFromVar(*x_vars[idx]);
+        if (tensor->numel() == 0) {
           continue;
         }
         if (dtype == -1) {
-          dtype = framework::ToDataType(lod_tensor.type());
+          dtype = framework::ToDataType(tensor->type());
         } else {
-          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
+          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type()));
         }
       }
       PADDLE_ENFORCE_NE(dtype, -1,
@@ -132,7 +136,7 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
         .AsDuplicable();
-    AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
+    AddOutput("Out", "(Tensor) The output tensor of sum operator.");
     AddAttr<bool>("use_mkldnn",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false);
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 34403c7a7aa717cca470be2931009e219e00e3ae..f6e12dfc76c6ce73f10e707387f6a9cedacde3c8 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -43,17 +43,31 @@ class SumKernel : public framework::OpKernel<T> {
         out->mutable_data<T>(context.GetPlace());
       }
       auto result = EigenVector<T>::Flatten(*out);
+      auto &place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      int start = in_place ? 1 : 0;
       if (!in_place) {
-        math::SetConstant<DeviceContext, T> constant_functor;
-        constant_functor(context.template device_context<DeviceContext>(), out,
-                         0.0);
+        if ((in_num >= 2) && in_vars[0]->IsType<framework::LoDTensor>() &&
+            in_vars[1]->IsType<framework::LoDTensor>()) {
+          auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
+          auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
+          if (in_0.numel() && in_1.numel()) {
+            auto in_0_e = EigenVector<T>::Flatten(in_0);
+            auto in_1_e = EigenVector<T>::Flatten(in_1);
+            result.device(place) = in_0_e + in_1_e;
+            start = 2;
+          }
+        }
+        if (start != 2) {
+          math::SetConstant<DeviceContext, T> constant_functor;
+          constant_functor(context.template device_context<DeviceContext>(),
+                           out, 0.0);
+        }
       }
 
       math::SelectedRowsAddToTensor<DeviceContext, T> functor;
-      auto &place =
-          *context.template device_context<DeviceContext>().eigen_device();
       // If in_place, just skip the first tensor
-      for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
+      for (size_t i = start; i < in_num; i++) {
         if (in_vars[i]->IsType<framework::LoDTensor>()) {
           auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
           if (in_t.numel() == 0) {
@@ -69,79 +83,54 @@ class SumKernel : public framework::OpKernel<T> {
         }
       }
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      std::unique_ptr<framework::SelectedRows> in0;
-      if (in_place) {
-        // If is in_place, we store the input[0] to in0
-        auto &in_sel0 = in_vars[0]->Get<SelectedRows>();
-        auto &rows = in_sel0.rows();
-#ifdef PADDLE_WITH_CUDA
-        std::vector<int64_t> rows_in_cpu;
-        rows_in_cpu.reserve(rows.size());
-        for (auto item : rows) {
-          rows_in_cpu.push_back(item);
-        }
-        in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height()));
-#else
-        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
-#endif
-        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      if (in_place && in_vars.size() < 2) {
+        return;
       }
 
-      auto get_selected_row = [&](size_t i) -> const SelectedRows & {
-        if (i == 0 && in0) {
-          return *in0.get();
-        } else {
-          return in_vars[i]->Get<SelectedRows>();
+      std::vector<const paddle::framework::SelectedRows *> inputs;
+      SelectedRows temp_in0;
+
+      if (in_place) {
+        auto &in0 = in_vars[0]->Get<SelectedRows>();
+        temp_in0.set_height(in0.height());
+        temp_in0.set_rows(in0.rows());
+        framework::TensorCopy(in0.value(), in0.place(),
+                              context.device_context(),
+                              temp_in0.mutable_value());
+        inputs.push_back(&temp_in0);
+        for (size_t i = 1; i < in_vars.size(); ++i) {
+          auto &in = in_vars[i]->Get<SelectedRows>();
+          if (in.rows().size() > 0) {
+            inputs.push_back(&in);
+          }
         }
-      };
+      } else {
+        for (auto &in_var : in_vars) {
+          auto &in = in_var->Get<SelectedRows>();
+          if (in.rows().size() > 0) {
+            inputs.push_back(&in_var->Get<SelectedRows>());
+          }
+        }
+      }
 
       auto *out = context.Output<SelectedRows>("Out");
       out->mutable_rows()->clear();
-      auto *out_value = out->mutable_value();
 
-      // Runtime InferShape
-      size_t first_dim = 0;
-      for (size_t i = 0; i < in_num; i++) {
-        auto &sel_row = get_selected_row(i);
-        first_dim += sel_row.rows().size();
-      }
-
-      std::vector<int64_t> in_dim;
-      for (size_t i = 0; i < in_num; i++) {
-        auto &sel_row = get_selected_row(i);
-        if (sel_row.rows().size() > 0) {
-          in_dim = framework::vectorize(sel_row.value().dims());
+      bool has_data = false;
+      for (auto &in : inputs) {
+        if (in->rows().size() > 0) {
+          has_data = true;
           break;
         }
       }
-      if (in_dim.empty()) {
-        VLOG(3) << "WARNING: all the inputs are empty";
-        in_dim =
-            framework::vectorize(get_selected_row(in_num - 1).value().dims());
+      if (has_data) {
+        math::scatter::MergeAdd<DeviceContext, T> merge_add;
+        merge_add(context.template device_context<DeviceContext>(), inputs,
+                  out);
       } else {
-        in_dim[0] = static_cast<int64_t>(first_dim);
-      }
-
-      out_value->Resize(framework::make_ddim(in_dim));
-      out_value->mutable_data<T>(context.GetPlace());
-      // if all the input sparse vars are empty, no need to
-      // merge these vars.
-      if (first_dim == 0UL) {
-        return;
-      }
-
-      math::SelectedRowsAddTo<DeviceContext, T> functor;
-
-      int64_t offset = 0;
-      for (size_t i = 0; i < in_num; i++) {
-        auto &sel_row = get_selected_row(i);
-        if (sel_row.rows().size() == 0) {
-          continue;
-        }
-        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
-        functor(context.template device_context<DeviceContext>(), sel_row,
-                offset, out);
-        offset += sel_row.value().numel();
+        // no data, just set a empty out tensor.
+        out->mutable_value()->mutable_data<T>(framework::make_ddim({0}),
+                                              context.GetPlace());
       }
     } else if (out_var->IsType<framework::LoDTensorArray>()) {
       auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index 4a8ac441cfaf642fde58ee30865a22e83c065498..c17d1afc309c65035063348d4934ea1783b018ed 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
-    AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X");
+    AddOutput("Out", "(Tensor) The output tensor of Topk op");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
     AddComment(R"DOC(
 Top K operator
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 8e4a07556fb51dbb15ef948fcee120e2f68e089a..0cad224ca8860b0e4bc2e3f2bc1659235aadfe2d 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -262,31 +262,31 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
                              const T* src, int lds, int dim, int k,
                              int grid_dim, int num) {
   __shared__ Pair<T> sh_topk[BlockSize];
-  __shared__ int maxid[BlockSize / 2];
   const int tid = threadIdx.x;
   const int warp = threadIdx.x / 32;
 
   const int bid = blockIdx.x;
   for (int i = bid; i < num; i += grid_dim) {
-    output += i * output_stride;
-    indices += i * k;
-
+    int top_num = k;
+    __shared__ int maxid[BlockSize / 2];
+    T* out = output + i * output_stride;
+    int64_t* inds = indices + i * k;
     Pair<T> topk[MaxLength];
     int beam = MaxLength;
     Pair<T> max;
     bool is_empty = false;
     bool firststep = true;
 
-    for (int k = 0; k < MaxLength; k++) {
-      topk[k].set(-INFINITY, -1);
+    for (int j = 0; j < MaxLength; j++) {
+      topk[j].set(-INFINITY, -1);
     }
-    while (k) {
+    while (top_num) {
       ThreadGetTopK<T, MaxLength, BlockSize>(
           topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
 
       sh_topk[tid] = topk[0];
-      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                           &indices, &beam, &k, tid, warp);
+      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
+                                           &beam, &top_num, tid, warp);
     }
   }
 }
@@ -327,13 +327,15 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
     const T* input_data = input->data<T>();
-
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     // FIXME(typhoonzero): data is always converted to type T?
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
 
-    size_t input_height = input->dims()[0];
-    size_t input_width = input->dims()[1];
+    framework::DDim inputdims = input->dims();
+    const size_t input_height = framework::product(
+        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
+    const size_t input_width = inputdims[inputdims.size() - 1];
+
     if (k > input_width) k = input_width;
 
     // NOTE: pass lds and dim same to input width.
@@ -342,14 +344,12 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     const int kMaxHeight = 2048;
     int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
     auto& dev_ctx = ctx.cuda_device_context();
-
     switch (GetDesiredBlockDim(input_width)) {
       FIXED_BLOCK_DIM(
           KeMatrixTopK<T, 5,
                        kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-              output_data, output->dims()[1], indices_data, input_data,
-              input_width, input_width, static_cast<int>(k), gridx,
-              input_height));
+              output_data, k, indices_data, input_data, input_width,
+              input_width, static_cast<int>(k), gridx, input_height));
       default:
         PADDLE_THROW("Error");
     }
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 054dd481994d03f71b0ed5dc73e103085f6c91aa..76ece57b39919148da04caecaa43ea9d2b9d95df 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -34,7 +34,6 @@ class TopkKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
-    // FIXME: only deal with matrix(2d tensor).
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
@@ -44,8 +43,6 @@ class TopkKernel : public framework::OpKernel<T> {
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
 
-    auto eg_input = EigenMatrix<T>::From(*input);
-
     // reshape input to a flattern matrix(like flat_inner_dims)
     framework::DDim inputdims = input->dims();
     const size_t row = framework::product(
@@ -53,7 +50,7 @@ class TopkKernel : public framework::OpKernel<T> {
     const size_t col = inputdims[inputdims.size() - 1];
     Eigen::DSizes<int, 2> flat2dims(row, col);
     // NOTE: eigen shape doesn't affect paddle tensor.
-    eg_input.reshape(flat2dims);
+    auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
 
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 6a9fc6611a8f8eaa6749aefac0673ccabaebbcfe..bbd71db6062107f6ba40343c84d942b54b3958e6 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -210,18 +210,21 @@ REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
 REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
-    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
+    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     transpose_grad,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
                   ops::Transpose2GradMaker);
 REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
 
 REGISTER_OP_CPU_KERNEL(
-    transpose2,
-    ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
+    transpose2, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     transpose2_grad,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc
index c1b5a8b31be243fab3af06a18c8e51986c953700..b4025350fa9f3610bde43eee91cd059f3063813f 100644
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
@@ -16,15 +16,18 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    transpose,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+    transpose, ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     transpose_grad,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
 
 REGISTER_OP_CUDA_KERNEL(
     transpose2,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     transpose2_grad,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index aa907595cb7cf165974caa69fe8eb0370471732d..e3132ae76f624f3338d749e4fcebbd0ecd7ffe79 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -29,7 +29,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
     if (out_var->IsType<framework::LoDTensor>()) {
       tensor = out_var->GetMutable<framework::LoDTensor>();
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      auto shape = ctx.Attr<std::vector<int>>("shape");
+      auto shape = ctx.Attr<std::vector<int64_t>>("shape");
       auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
       tensor = selected_rows->mutable_value();
       tensor->Resize(framework::make_ddim(shape));
@@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(
         ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
         "uniform_random's min must less then max");
-    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
     std::vector<int64_t> temp;
     temp.reserve(shape.size());
     for (auto dim : shape) {
@@ -94,7 +94,7 @@ This operator initializes a tensor with random values sampled from a
 uniform distribution. The random result is in set [min, max].
 
 )DOC");
-    AddAttr<std::vector<int>>("shape", "The shape of the output tensor");
+    AddAttr<std::vector<int64_t>>("shape", "The shape of the output tensor");
     AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
         .SetDefault(-1.0f);
     AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index bbb692b0ddfc18e8a62c0d2a6bac88f9932f6704..2bb0ecc139f7096d1b61150e0a2d4fb095338749 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -48,7 +48,7 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     if (out_var->IsType<framework::LoDTensor>()) {
       tensor = out_var->GetMutable<framework::LoDTensor>();
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      auto shape = context.Attr<std::vector<int>>("shape");
+      auto shape = context.Attr<std::vector<int64_t>>("shape");
       tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
       tensor->Resize(framework::make_ddim(shape));
     } else {
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 2880c09263f10e9c624e11b77188171f48d9db28..b5f472d20f40fa182a4aa55ff384b0954e4ba9e3 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
       return cpu.has(Cpu::tAVX);
     case avx2:
       return cpu.has(Cpu::tAVX2);
-    case avx512_common:
+    case avx512f:
       return cpu.has(Cpu::tAVX512F);
     case avx512_core:
       return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) &&
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 30c8fbcfce92a8b06a175ddf198cde572f72b2a4..6810a1651a14cdb2080af846b21cad242b70bf35 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -43,7 +43,7 @@ typedef enum {
   sse42,
   avx,
   avx2,
-  avx512_common,
+  avx512f,
   avx512_core,
   avx512_core_vnni,
   avx512_mic,
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index bb8b14bb9fa41942c3aa653ca224c0842fbf9a00..1ad66f0525e4857533d5cf5ef3ee041aec297794 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -341,6 +341,28 @@ class ScopedPoolingDescriptor {
   DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
 };
 
+class ScopedSpatialTransformerDescriptor {
+ public:
+  ScopedSpatialTransformerDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
+  }
+  ~ScopedSpatialTransformerDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
+  }
+
+  template <typename T>
+  inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims,
+                                                        const int dimA[]) {
+    PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor(
+        desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType<T>::type, nbDims, dimA));
+    return desc_;
+  }
+
+ private:
+  cudnnSpatialTransformerDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor);
+};
+
 inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
   bool use_cudnn = ctx.Attr<bool>("use_cudnn");
   use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 4286242b2a93d7046e7349a99d1d1a09dca09113..924810bd61841139bc1849a000aaa57b07b71f0c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -32,13 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
         "'Place' is not supported, Please re-compile with WITH_GPU "
         "option");
   }
-  return it->second.get();
+  return it->second.get().get();
+}
+
+template <typename DevCtx, typename PlaceType>
+inline void EmplaceDeviceContext(
+    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
+        map_ptr,
+    platform::Place p) {
+  using PtrType = std::unique_ptr<DeviceContext>;
+  map_ptr->emplace(p, std::async(std::launch::deferred, [=] {
+                     // lazy evaluation. i.e., only create device context at
+                     // first `Get`
+                     return PtrType(new DevCtx(boost::get<PlaceType>(p)));
+                   }));
 }
 
 DeviceContextPool::DeviceContextPool(
     const std::vector<platform::Place>& places) {
   PADDLE_ENFORCE_GT(places.size(), 0);
-  using PtrType = std::unique_ptr<DeviceContext>;
   std::set<Place> set;
   for (auto& p : places) {
     set.insert(p);
@@ -47,16 +59,13 @@ DeviceContextPool::DeviceContextPool(
   for (auto& p : set) {
     if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
-      device_contexts_.emplace(
-          p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
+      EmplaceDeviceContext<MKLDNNDeviceContext, CPUPlace>(&device_contexts_, p);
 #else
-      device_contexts_.emplace(
-          p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
+      EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
 #endif
     } else if (platform::is_gpu_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(
-          p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
+      EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
 #else
       PADDLE_THROW(
           "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -64,9 +73,8 @@ DeviceContextPool::DeviceContextPool(
 #endif
     } else if (platform::is_cuda_pinned_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(
-          p,
-          PtrType(new CUDAPinnedDeviceContext(boost::get<CUDAPinnedPlace>(p))));
+      EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
+          &device_contexts_, p);
 #else
       PADDLE_THROW(
           "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -286,38 +294,73 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
 
 #ifdef PADDLE_WITH_MKLDNN
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
-  p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
+    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobmap_() {
+  p_blobmap_.reset(new BlobMap());
+  p_mutex_.reset(new std::mutex());
 }
 
+namespace {
+// Current thread's id.
+thread_local int cur_thread_id = 0;
+}
+
+void set_cur_thread_id(int tid) { cur_thread_id = tid; }
+int get_cur_thread_id(void) { return cur_thread_id; }
+
 void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                   std::shared_ptr<void> data) const {
-  std::unordered_map<std::string, std::shared_ptr<void>>* p;
-  p = p_blobs_.get();
+  BlobMap* pMap = p_blobmap_.get();
+  std::shared_ptr<KeyBlob> pBlob = nullptr;
+
+  int tid = platform::get_cur_thread_id();
+
+  std::lock_guard<std::mutex> lock(*p_mutex_.get());
 
-  auto it = p->find(name);
+  // Find KeyBlob for current thread
+  auto map_it = pMap->find(tid);
 
-  if (it == p->end()) {
-    (*p)[name] = data;  // create new blob
+  if (map_it == pMap->end()) {
+    // 1st time to set blob in current thread
+    pBlob = std::shared_ptr<KeyBlob>(new KeyBlob());
+    (*pMap)[tid] = pBlob;
   } else {
-    it->second = data;  // set data to existing blob
+    pBlob = map_it->second;
   }
 
+  // Find Key in found (or newly created) KeyBlob
+  auto key_it = pBlob->find(name);
+
+  if (key_it == pBlob->end()) {
+    (*pBlob)[name] = data;  // create new blob
+  } else {
+    key_it->second = data;  // set data to existing blob
+  }
+
+  // lock will be automatically released when out of scope
   return;
 }
 
 std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
     const std::string& name) const {
-  std::unordered_map<std::string, std::shared_ptr<void>>* p;
-  p = p_blobs_.get();
+  BlobMap* pMap = p_blobmap_.get();
+  std::shared_ptr<KeyBlob> pBlob = nullptr;
 
-  auto it = p->find(name);
+  int tid = platform::get_cur_thread_id();
 
-  if (it != p->end()) {
-    return it->second;
-  }
+  std::lock_guard<std::mutex> lock(*p_mutex_.get());
+
+  // Find KeyBlob for current thread firstly
+  auto map_it = pMap->find(tid);
+  if (map_it == pMap->end()) return nullptr;
+  pBlob = map_it->second;
+
+  // Find Blob via name
+  auto key_it = pBlob->find(name);
+
+  if (key_it == pBlob->end()) return nullptr;
 
-  return nullptr;
+  // lock will be automatically released when out of scope
+  return key_it->second;
 }
 
 #endif
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index e1ff1a1746952de5aa4bead361b50af4e99bc9bc..0240b9380f3213b2a030061007e04abe1d73c6e3 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include <future>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
@@ -176,6 +177,12 @@ struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
+using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
+using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;
+
+void set_cur_thread_id(int);
+int get_cur_thread_id(void);
+
 class MKLDNNDeviceContext : public CPUDeviceContext {
  public:
   explicit MKLDNNDeviceContext(CPUPlace place);
@@ -191,8 +198,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
 
  private:
   mkldnn::engine engine_;
-  std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
-      p_blobs_;
+  std::shared_ptr<BlobMap> p_blobmap_;
+  std::shared_ptr<std::mutex> p_mutex_;
 };
 #endif
 
@@ -228,7 +235,8 @@ class DeviceContextPool {
 
  private:
   static DeviceContextPool* pool;
-  std::map<Place, std::unique_ptr<DeviceContext>> device_contexts_;
+  std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
+      device_contexts_;
   DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
 };
 
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index e6353f67ef118072a2d8e49111e8ecc486589998..d3d754b6f58d25a9dfacafaf55d50b353a71ee6d 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -65,44 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
  * include all needed cudnn functions in HPPL
  * different cudnn version has different interfaces
  **/
-#define CUDNN_DNN_ROUTINE_EACH(__macro)             \
-  __macro(cudnnSetTensor4dDescriptor);              \
-  __macro(cudnnSetTensor4dDescriptorEx);            \
-  __macro(cudnnSetTensorNdDescriptor);              \
-  __macro(cudnnGetTensorNdDescriptor);              \
-  __macro(cudnnGetConvolutionNdForwardOutputDim);   \
-  __macro(cudnnGetConvolutionForwardAlgorithm);     \
-  __macro(cudnnCreateTensorDescriptor);             \
-  __macro(cudnnDestroyTensorDescriptor);            \
-  __macro(cudnnCreateFilterDescriptor);             \
-  __macro(cudnnSetFilter4dDescriptor);              \
-  __macro(cudnnSetFilterNdDescriptor);              \
-  __macro(cudnnGetFilterNdDescriptor);              \
-  __macro(cudnnSetPooling2dDescriptor);             \
-  __macro(cudnnSetPoolingNdDescriptor);             \
-  __macro(cudnnGetPoolingNdDescriptor);             \
-  __macro(cudnnDestroyFilterDescriptor);            \
-  __macro(cudnnCreateConvolutionDescriptor);        \
-  __macro(cudnnCreatePoolingDescriptor);            \
-  __macro(cudnnDestroyPoolingDescriptor);           \
-  __macro(cudnnSetConvolution2dDescriptor);         \
-  __macro(cudnnDestroyConvolutionDescriptor);       \
-  __macro(cudnnSetConvolutionNdDescriptor);         \
-  __macro(cudnnGetConvolutionNdDescriptor);         \
-  __macro(cudnnDeriveBNTensorDescriptor);           \
-  __macro(cudnnCreate);                             \
-  __macro(cudnnDestroy);                            \
-  __macro(cudnnSetStream);                          \
-  __macro(cudnnActivationForward);                  \
-  __macro(cudnnConvolutionForward);                 \
-  __macro(cudnnConvolutionBackwardBias);            \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize); \
-  __macro(cudnnTransformTensor);                    \
-  __macro(cudnnPoolingForward);                     \
-  __macro(cudnnPoolingBackward);                    \
-  __macro(cudnnSoftmaxBackward);                    \
-  __macro(cudnnSoftmaxForward);                     \
-  __macro(cudnnGetVersion);                         \
+#define CUDNN_DNN_ROUTINE_EACH(__macro)              \
+  __macro(cudnnSetTensor4dDescriptor);               \
+  __macro(cudnnSetTensor4dDescriptorEx);             \
+  __macro(cudnnSetTensorNdDescriptor);               \
+  __macro(cudnnGetTensorNdDescriptor);               \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);    \
+  __macro(cudnnGetConvolutionForwardAlgorithm);      \
+  __macro(cudnnCreateTensorDescriptor);              \
+  __macro(cudnnDestroyTensorDescriptor);             \
+  __macro(cudnnCreateFilterDescriptor);              \
+  __macro(cudnnSetFilter4dDescriptor);               \
+  __macro(cudnnSetFilterNdDescriptor);               \
+  __macro(cudnnGetFilterNdDescriptor);               \
+  __macro(cudnnSetPooling2dDescriptor);              \
+  __macro(cudnnSetPoolingNdDescriptor);              \
+  __macro(cudnnGetPoolingNdDescriptor);              \
+  __macro(cudnnDestroyFilterDescriptor);             \
+  __macro(cudnnCreateConvolutionDescriptor);         \
+  __macro(cudnnCreatePoolingDescriptor);             \
+  __macro(cudnnDestroyPoolingDescriptor);            \
+  __macro(cudnnSetConvolution2dDescriptor);          \
+  __macro(cudnnDestroyConvolutionDescriptor);        \
+  __macro(cudnnSetConvolutionNdDescriptor);          \
+  __macro(cudnnGetConvolutionNdDescriptor);          \
+  __macro(cudnnDeriveBNTensorDescriptor);            \
+  __macro(cudnnCreateSpatialTransformerDescriptor);  \
+  __macro(cudnnSetSpatialTransformerNdDescriptor);   \
+  __macro(cudnnDestroySpatialTransformerDescriptor); \
+  __macro(cudnnSpatialTfGridGeneratorForward);       \
+  __macro(cudnnSpatialTfGridGeneratorBackward);      \
+  __macro(cudnnSpatialTfSamplerForward);             \
+  __macro(cudnnSpatialTfSamplerBackward);            \
+  __macro(cudnnCreate);                              \
+  __macro(cudnnDestroy);                             \
+  __macro(cudnnSetStream);                           \
+  __macro(cudnnActivationForward);                   \
+  __macro(cudnnConvolutionForward);                  \
+  __macro(cudnnConvolutionBackwardBias);             \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize);  \
+  __macro(cudnnTransformTensor);                     \
+  __macro(cudnnPoolingForward);                      \
+  __macro(cudnnPoolingBackward);                     \
+  __macro(cudnnSoftmaxBackward);                     \
+  __macro(cudnnSoftmaxForward);                      \
+  __macro(cudnnGetVersion);                          \
   __macro(cudnnGetErrorString);
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index f04395a8ac00f33501008aa12f22773ddda9b138..a251bfcd9914422cb6300adbbcdef3dfa79f441c 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -130,6 +130,13 @@ struct EOFException : public std::exception {
 #define UNLIKELY(condition) (condition == 0)
 #endif
 
+#if !defined(_WIN32)
+#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
+#else
+// there is no equivalent intrinsics in msvc.
+#define LIKELY(condition) (condition != 0)
+#endif
+
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     bool stat, const Args&... args) {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 4c99f4be321160caf0ee2f89a655bdfb933408e3..ab91ca5345047f3053eb8771e6a265d2a3011f85 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
 
-  if (platform::jit::MayIUse(platform::jit::avx512_common)) {
+  if (platform::jit::MayIUse(platform::jit::avx512f)) {
 #ifndef __AVX512F__
     LOG(WARNING) << "AVX512F is available, Please re-compile on local machine";
 #endif
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 612f3bc0e7940663f84a55b2c4395a7b5119d5bb..da46a1abe12258b47b2fd4afb5f146daf15e026d 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -30,6 +30,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/string/printf.h"
 
+DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
+
 namespace paddle {
 namespace platform {
 
@@ -193,6 +195,13 @@ RecordEvent::~RecordEvent() {
   PopEvent(name_, dev_ctx_);
 }
 
+RecordRPCEvent::RecordRPCEvent(const std::string& name,
+                               const DeviceContext* dev_ctx) {
+  if (FLAGS_enable_rpc_profiler) {
+    event_.reset(new platform::RecordEvent(name, dev_ctx));
+  }
+}
+
 RecordBlock::RecordBlock(int block_id)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
   std::lock_guard<std::mutex> l(profiler_mu);
@@ -370,8 +379,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
   std::vector<std::vector<Event>> merged_events_list;
   if (merge_thread) {
     std::vector<Event> merged_events;
-    for (int i = 0; i < events.size(); ++i) {
-      for (int j = 0; j < events[i].size(); ++j) {
+    for (size_t i = 0; i < events.size(); ++i) {
+      for (size_t j = 0; j < events[i].size(); ++j) {
         merged_events.push_back(events[i][j]);
       }
     }
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 38630686f7cf3c669373f941d989adf11ba6cfe6..e8eae874afa3d17f0d3374eef457cdbacb3f8424 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -71,6 +71,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
 
 #if !defined(_WIN32)
 struct RecordEvent {
+  // dev_ctx can be set to nullptr if device is cpu.
   RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
 
   ~RecordEvent();
@@ -86,6 +87,16 @@ struct RecordEvent {
   std::string full_name_;
 };
 
+class RecordRPCEvent {
+ public:
+  // dev_ctx can be set to nullptr if device is cpu.
+  RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx);
+  ~RecordRPCEvent() {}
+
+ private:
+  std::unique_ptr<RecordEvent> event_;
+};
+
 struct RecordBlock {
   explicit RecordBlock(int block_id);
   ~RecordBlock();
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 3b22718a8c6f994dbc2dc3e7aaa19a7163f716ba..d3b0d4a22954c1d67dc9551b997dcffa0625cbeb 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -57,6 +57,18 @@ struct variant_caster<V<Ts...>> {
     auto caster = make_caster<T>();
     if (!load_success_ && caster.load(src, convert)) {
       load_success_ = true;
+
+      if (std::is_same<T, std::vector<float>>::value) {
+        auto caster_ints = make_caster<std::vector<int64_t>>();
+        if (caster_ints.load(src, convert)) {
+          VLOG(4) << "This value are floats and int64_ts satisfy "
+                     "simultaneously, will set it's type to "
+                     "std::vector<int64_t>";
+          value = cast_op<std::vector<int64_t>>(caster_ints);
+          return true;
+        }
+      }
+
       value = cast_op<T>(caster);
       return true;
     }
@@ -259,6 +271,8 @@ void BindOpDesc(pybind11::module *m) {
   pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "")
       .value("INT", pd::proto::AttrType::INT)
       .value("INTS", pd::proto::AttrType::INTS)
+      .value("LONG", pd::proto::AttrType::LONG)
+      .value("LONGS", pd::proto::AttrType::LONGS)
       .value("FLOAT", pd::proto::AttrType::FLOAT)
       .value("FLOATS", pd::proto::AttrType::FLOATS)
       .value("STRING", pd::proto::AttrType::STRING)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 1709546fae288e84ce2b6fef4bfd1bd68da84b1d..7c7b14df6618bd636f3636612486884b573309fb 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -57,6 +57,10 @@ limitations under the License. */
 
 #include "pybind11/stl.h"
 
+DEFINE_bool(reader_queue_speed_test_mode, false,
+            "If set true, the queue.pop will only get data from queue but not "
+            "remove the data from queue for speed testing");
+
 // disable auto conversion to list in Python
 PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
 
@@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle.
                                return make_ddim(shape);
                              });
               auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
-              holder->InitOnce(capacity, dims);
+              holder->InitOnce(capacity, dims,
+                               FLAGS_reader_queue_speed_test_mode);
               return holder->GetQueue();
             },
         py::return_value_policy::copy);
@@ -640,9 +645,13 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
-      .def("set_str", [](ir::Pass &self, const std::string &name,
-                         const std::string &attr) {
-        self.Set<std::string>(name, new std::string(attr));
+      .def(
+          "set_str",
+          [](ir::Pass &self, const std::string &name, const std::string &attr) {
+            self.Set<std::string>(name, new std::string(attr));
+          })
+      .def("set_int", [](ir::Pass &self, const std::string &name, int val) {
+        self.Set<const int>(name, new int(val));
       });
 
   py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
@@ -812,6 +821,13 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self, bool b) {
             self.enable_data_balance_ = b;
           })  // FIXME(chengudo): enable_data_balance seems not important
+      .def_property("enable_sequential_execution",
+                    [](const BuildStrategy &self) {
+                      return self.enable_sequential_execution_;
+                    },
+                    [](BuildStrategy &self, bool b) {
+                      self.enable_sequential_execution_ = b;
+                    })
       .def_property(
           "fuse_elewise_add_act_ops",
           [](const BuildStrategy &self) {
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
index 78d6e5ff554b9cd9facae85be166a697e0b75337..eabb51d370aff709e289e1fc727aa2dbb551d82e 100644
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -15,6 +15,7 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
 include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
@@ -27,6 +28,7 @@ link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
 
 add_executable(demo_trainer demo_trainer.cc)
@@ -62,5 +64,5 @@ target_link_libraries(demo_trainer
         ${ARCHIVE_END}
         ${MATH_LIB}
         ${MKLDNN_LIB}
-        glog gflags protobuf snappystream snappy z
+        glog gflags protobuf snappystream snappy z xxhash
         ${EXTERNAL_LIB})
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
index 41b01d33828f750f67bba5f82cb7ed6fe4d4ea0a..191da20669e185d819ec5eed55427461cc0b10e4 100644
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
          -DWITH_MKL=OFF \
          -DWITH_MKLDNN=OFF
 make -j8
-make -j8 inference_lib_dist
+make -j8 fluid_lib_dist
 ```
 
 ### step 2. generate program desc
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index b97e63ecc85872ebccb21f37bb4f2f00d49b4599..d7676f89ab5e781f910f98d03e72d5f7c1023a9a 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -95,9 +95,9 @@ function cmake_gen() {
                 exit 1
             fi
         fi
-    else 
+    else
         if [ "$1" != "" ]; then
-            echo "using python abi: $1"     
+            echo "using python abi: $1"
             if [ "$1" == "cp27-cp27m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
                 export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
@@ -119,7 +119,7 @@ function cmake_gen() {
            fi
         fi
     fi
-    
+
     if [ "$SYSTEM" == "Darwin" ]; then
         WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
         WITH_AVX=${WITH_AVX:-ON}
@@ -127,7 +127,7 @@ function cmake_gen() {
     else
         INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
     fi
-    
+
     cat <<EOF
     ========================================
     Configuring cmake in /paddle/build ...
@@ -147,13 +147,11 @@ function cmake_gen() {
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
         -DCUDNN_ROOT=/usr/
         -DWITH_TESTING=${WITH_TESTING:-ON}
-        -DWITH_FAST_BUNDLE_TEST=ON
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-        -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
@@ -181,12 +179,10 @@ EOF
         -DWITH_PYTHON=${WITH_PYTHON:-ON} \
         -DCUDNN_ROOT=/usr/ \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
-        -DWITH_FAST_BUNDLE_TEST=ON \
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-        -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
@@ -390,10 +386,12 @@ function run_mac_test() {
     Running unit tests ...
     ========================================
 EOF
-
+        #remove proxy here to fix dist error on mac
+        export http_proxy=
+        export https_proxy=
         # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j $1     
-        # make install should also be test when unittest 
+        ctest --output-on-failure -j $1
+        # make install should also be test when unittest
         make install -j 8
         pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
@@ -648,42 +646,47 @@ function gen_capi_package() {
     fi
 }
 
-function gen_fluid_inference_lib() {
+function gen_fluid_lib() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
         cat <<EOF
     ========================================
-    Generating fluid inference library ...
+    Generating fluid library for train and inference ...
     ========================================
 EOF
-        cmake .. -DWITH_DISTRIBUTE=OFF
+        cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON
+        make -j `nproc` fluid_lib_dist
         make -j `nproc` inference_lib_dist
       fi
 }
 
-function tar_fluid_inference_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+function tar_fluid_lib() {
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
         cat <<EOF
     ========================================
-    Taring fluid inference library ...
+    Taring fluid library for train and inference ...
     ========================================
 EOF
         cd ${PADDLE_ROOT}/build
         cp -r fluid_install_dir fluid
         tar -czf fluid.tgz fluid
+        cp -r fluid_inference_install_dir fluid_inference
+        tar -czf fluid_inference.tgz fluid_inference
       fi
 }
 
-function test_fluid_inference_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+function test_fluid_lib() {
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
         cat <<EOF
     ========================================
-    Testing fluid inference library ...
+    Testing fluid library for inference ...
     ========================================
 EOF
         cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
-        ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
+        ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
+                 ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \
+                 ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
         ./clean.sh
       fi
 }
@@ -731,9 +734,9 @@ function main() {
         ;;
       fluid_inference_lib)
         cmake_gen ${PYTHON_ABI:-""}
-        gen_fluid_inference_lib
-        tar_fluid_inference_lib
-        test_fluid_inference_lib
+        gen_fluid_lib
+        tar_fluid_lib
+        test_fluid_lib
         ;;
       check_style)
         check_style
@@ -744,8 +747,8 @@ function main() {
         assert_api_not_changed ${PYTHON_ABI:-""}
         run_test
         gen_capi_package
-        gen_fluid_inference_lib
-        test_fluid_inference_lib
+        gen_fluid_lib
+        test_fluid_lib
         assert_api_spec_approvals
         ;;
       maccheck)
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
index 9c02e0f41b04e113251e0fda72ca8abd976ab6f7..4a0c1f8cb663ec105030ac2c5a70c5f906cf6d12 100644
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -78,7 +78,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
                     six.iteritems(word_dict), key=lambda x: x[1],
                     reverse=True)):
             if idx + 3 == dict_size: break
-            fout.write("%s\n" % (word[0]))
+            fout.write("%s\n" % (cpt.to_bytes(word[0])))
 
 
 def __load_dict(tar_file, dict_size, lang, reverse=False):
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 7bbdf7de89cc932e0023952e3c8e102f92b06855..737c8be8147a7efaf9b89827f063430146d3c078 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -113,12 +113,17 @@ def __bootstrap__():
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb'
+        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
+        'reader_queue_speed_test_mode'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
         read_env_flags.append('rpc_server_profile_period')
         read_env_flags.append('rpc_server_profile_path')
+        read_env_flags.append('enable_rpc_profiler')
+        read_env_flags.append('rpc_send_thread_num')
+        read_env_flags.append('rpc_get_thread_num')
+        read_env_flags.append('rpc_prefetch_thread_num')
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 4c24d0d6a7069c75c7b9b8245f4567ae8bfc2742..1738afe93e99f1de28bec2fb23be8e1a309d9288 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -272,7 +272,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                 )
 
         square = grad * grad
-        local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64')
+        local_norm_var = layers.reduce_sum(input=square)
         context[self.group_name].append(local_norm_var)
 
         self.context = context
@@ -282,7 +282,6 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
         if group_scale_name not in self.context:
             group_norm_var = layers.sums(input=self.context[self.group_name])
             group_norm_var = layers.sqrt(x=group_norm_var)
-            group_norm_var = layers.cast(group_norm_var, 'float32')
             clip_var = self.context[self.group_name + "_clip"]
             group_scale_var = layers.elementwise_div(
                 x=clip_var,
@@ -333,7 +332,8 @@ def append_gradient_clip_ops(param_grads):
     for p, g in param_grads:
         if g is None:
             continue
-        with p.block.program._optimized_guard([p, g]):
+        with p.block.program._optimized_guard(
+            [p, g]), framework.name_scope('append_clip'):
             clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
             if clip_attr is None:
                 clip_attr = NullGradientClipAttr()
@@ -348,7 +348,8 @@ def append_gradient_clip_ops(param_grads):
     for p, g in param_grads:
         if g is None:
             continue
-        with p.block.program._optimized_guard([p, g]):
+        with p.block.program._optimized_guard(
+            [p, g]), framework.name_scope('append_graident_clip'):
             res.append(clip_attr._create_operators(param=p, grad=g))
 
     return res
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 7a82038ff78b17b2ddfd7b47320d41a7de9a2b8a..c84dd4bc4751df6cb922e13593d8a07e71a9b9d5 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -316,7 +316,7 @@ class DetectionMAP(Evaluator):
         gt_label (Variable): The ground truth label index, which is a LoDTensor
             with shape [N, 1].
         gt_box (Variable): The ground truth bounding box (bbox), which is a
-            LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax].
+            LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
         gt_difficult (Variable|None): Whether this ground truth is a difficult
             bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
             it means all the ground truth labels are not difficult bbox.
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 5f3111f363ccc14de4dd3f067097a19eabb83662..fd03dff386cad21c727ca0f266fa1b37ad65b4ad 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1496,6 +1496,9 @@ class Program(object):
             >>> with program._optimized_guard([p,g]):
             >>>     p = p - 0.001 * g
         """
+        tmp_role = self._current_role
+        tmp_var = self._op_role_var
+
         OpRole = core.op_proto_and_checker_maker.OpRole
         self._current_role = OpRole.Optimize
         self._op_role_var = [
@@ -1503,11 +1506,11 @@ class Program(object):
             for var in param_and_grads
         ]
         yield
-        self._op_role_var = []
-        self._current_role = OpRole.Forward
+        self._op_role_var = tmp_var
+        self._current_role = tmp_role
 
     @contextlib.contextmanager
-    def _lr_schedule_guard(self):
+    def _lr_schedule_guard(self, is_with_opt=False):
         """
         A with guard to set :code:`LRSched` :code:`OpRole` and
         :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
@@ -1515,6 +1518,10 @@ class Program(object):
 
         Notes: This is a very low level API. Users should not use it directly.
 
+        Args:
+            is_with_opt: Only set to true if these ops a in the middle
+                 of a bunch of optimize ops so that it can be treated
+                 correctly. For example, sgd->lr_op->sgd->lr_op->sgd.
 
         Examples:
 
@@ -1522,13 +1529,19 @@ class Program(object):
             >>> with program.lr_schedule_guard():
             >>>     lr = lr * decay
         """
+
+        tmp_role = self._current_role
+        tmp_var = self._op_role_var
+
         OpRole = core.op_proto_and_checker_maker.OpRole
         self._current_role = OpRole.LRSched
+        if is_with_opt:
+            self._current_role = int(OpRole.LRSched) | int(OpRole.Optimize)
         # TODO(typhoonzero): how to set target learning rate var
         self._op_role_var = []
         yield
-        self._op_role_var = []
-        self._current_role = OpRole.Forward
+        self._op_role_var = tmp_var
+        self._current_role = tmp_role
 
     def __str__(self):
         """
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 604f3eacd75beff306915b224b30c369dd3a486f..22c60c1cbe4faa8577fa655766e42694652e498d 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
 
     load_prog = Program()
     load_block = load_prog.global_block()
+    need_delete_vars = []
 
     for var_tuple in slice_vars_and_attrs:
         orig_var = var_tuple[0]
         start = var_tuple[1]
         slice_var = var_tuple[2]
-        end = start + reduce(lambda x, y: x * y, slice_var.shape)
+        end = start + slice_var.shape[0]
 
         clone_orig_var = load_block.create_var(
             name=orig_var.name,
@@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
             attrs={'axes': [0],
                    'starts': [start],
                    'ends': [end]})
-
+        need_delete_vars.append(clone_orig_var)
+    load_block.append_op(
+        type='delete_var',
+        inputs={'X': need_delete_vars}, )
     executor.run(load_prog)
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index bd9727b6ac0208b199091db00bd0fd5fae74d53b..dc317de9abbd06f4021e64b87ea88ba6af8809c9 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -324,10 +324,19 @@ class LayerHelper(object):
             raise ValueError("no Parameter name %s found" % name)
         return param
 
-    def create_tmp_variable(self, dtype, stop_gradient=False):
+    def create_variable_for_type_inference(self, dtype, stop_gradient=False):
+        """Create a temporary variable that should be type inferred layer.
+
+        Note:
+            The default type will be set to LOD_TENSOR. However, when
+            the var is used as operator output, its type will be updated
+            based on operator's `VarTypeInference` implementation in
+            infer_var_type.
+        """
         return self.main_program.current_block().create_var(
             name=unique_name.generate(".".join([self.name, 'tmp'])),
             dtype=dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
             persistable=False,
             stop_gradient=stop_gradient)
 
@@ -388,7 +397,7 @@ class LayerHelper(object):
 
         b = self.create_parameter(
             attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
-        tmp = self.create_tmp_variable(dtype=input_var.dtype)
+        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type='elementwise_add',
             inputs={'X': [input_var],
@@ -414,7 +423,7 @@ class LayerHelper(object):
         tmp = input_var
         # NOTE(dzhwinter): some activation support inplace compution.
         if not core.IsInplace(act_type):
-            tmp = self.create_tmp_variable(dtype=input_var.dtype)
+            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
             inputs={"X": [input_var]},
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index e868ff8b6aeb5a7be1eb6a6b9ff94f8560277a5f..9730fbf510cbe8c323b761b29821710f2c14a81d 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -80,8 +80,8 @@ def split_lod_tensor(input, mask, level=0):
 
     """
     helper = LayerHelper('split_lod_tensor', **locals())
-    out_true = helper.create_tmp_variable(dtype=input.dtype)
-    out_false = helper.create_tmp_variable(dtype=input.dtype)
+    out_true = helper.create_variable_for_type_inference(dtype=input.dtype)
+    out_false = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='split_lod_tensor',
         inputs={
@@ -131,7 +131,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
                 in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
     """
     helper = LayerHelper('merge_lod_tensor', **locals())
-    out = helper.create_tmp_variable(dtype=in_true.dtype)
+    out = helper.create_variable_for_type_inference(dtype=in_true.dtype)
     helper.append_op(
         type='merge_lod_tensor',
         inputs={'X': x,
@@ -524,7 +524,7 @@ class StaticRNN(object):
         if not isinstance(o, Variable):
             raise TypeError("step output takes a Variable")
 
-        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
+        tmp_o = self.helper.create_variable_for_type_inference(dtype=o.dtype)
         self.helper.append_op(
             type='rnn_memory_helper',
             inputs={'X': [o]},
@@ -606,7 +606,8 @@ class StaticRNN(object):
             pre_memories.append(mem.pre_mem.name)
             mem_var = rnn_block.var(mem.mem.name)
             assert isinstance(mem_var, Variable)
-            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
+            new_mem = self.helper.create_variable_for_type_inference(
+                dtype=mem_var.dtype)
 
             rnn_block.append_op(
                 type='rnn_memory_helper',
@@ -813,7 +814,7 @@ def max_sequence_len(rank_table):
         ${out_comment}.
     """
     helper = LayerHelper("max_seqence_len", **locals())
-    res = helper.create_tmp_variable(dtype="int64")
+    res = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="max_sequence_len",
         inputs={"RankTable": rank_table},
@@ -884,7 +885,7 @@ def array_to_lod_tensor(x, table):
           lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
     """
     helper = LayerHelper("array_to_lod_tensor", **locals())
-    tmp = helper.create_tmp_variable(dtype=x.dtype)
+    tmp = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type="array_to_lod_tensor",
         inputs={'X': x,
@@ -915,7 +916,7 @@ def increment(x, value=1.0, in_place=True):
     """
     helper = LayerHelper("increment", **locals())
     if not in_place:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = x
     helper.append_op(
@@ -1012,7 +1013,7 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
     """
     helper = LayerHelper("less_than", **locals())
     if cond is None:
-        cond = helper.create_tmp_variable(dtype='bool')
+        cond = helper.create_variable_for_type_inference(dtype='bool')
         cond.stop_gradient = True
 
     attrs = dict()
@@ -1051,7 +1052,7 @@ def equal(x, y, cond=None, **ignored):
     """
     helper = LayerHelper("equal", **locals())
     if cond is None:
-        cond = helper.create_tmp_variable(dtype='bool')
+        cond = helper.create_variable_for_type_inference(dtype='bool')
         cond.stop_gradient = True
 
     helper.append_op(
@@ -1098,7 +1099,7 @@ def array_read(array, i):
             array,
             Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         raise TypeError("array should be tensor array vairable")
-    out = helper.create_tmp_variable(dtype=array.dtype)
+    out = helper.create_variable_for_type_inference(dtype=array.dtype)
     helper.append_op(
         type='read_from_array',
         inputs={'X': [array],
@@ -1133,7 +1134,7 @@ def shrink_memory(x, i, table):
         usage.
     """
     helper = LayerHelper('shrink_memory', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='shrink_rnn_memory',
         inputs={'X': [x],
@@ -1170,7 +1171,7 @@ def array_length(array):
 
     """
     helper = LayerHelper('array_length', **locals())
-    tmp = helper.create_tmp_variable(dtype='int64')
+    tmp = helper.create_variable_for_type_inference(dtype='int64')
     tmp.stop_gradient = True
     helper.append_op(
         type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
@@ -1267,7 +1268,7 @@ class ConditionalBlock(object):
         ]
 
         step_scope = parent_block.create_var(
-            name='control_scope', type=core.VarDesc.VarType.STEP_SCOPES)
+            type=core.VarDesc.VarType.STEP_SCOPES)
         parent_block.append_op(
             type='conditional_block',
             inputs={
@@ -1585,12 +1586,11 @@ class DynamicRNN(object):
         self.lod_rank_table = None
         self.max_seq_len = None
         self.step_idx = None
-        self.zero_idx = fill_constant(
-            shape=[1], value=0, dtype='int64', force_cpu=True)
+        self.zero_idx = None
         self.mem_dict = dict()
         self.output_array = []
         self.outputs = []
-        self.cond = self.helper.create_tmp_variable(dtype='bool')
+        self.cond = self.helper.create_variable_for_type_inference(dtype='bool')
         self.cond.stop_gradient = False
         self.while_op = While(self.cond)
         self.input_array = []
@@ -1791,6 +1791,7 @@ class DynamicRNN(object):
 
         """
         self._assert_in_rnn_block_('memory')
+        self._init_zero_idx_()
         if init is not None:
             if not isinstance(init, Variable):
                 raise TypeError(
@@ -1904,6 +1905,22 @@ class DynamicRNN(object):
             array_write(x=each, i=self.step_idx, array=outside_array)
             self.output_array.append(outside_array)
 
+    def _init_zero_idx_(self):
+        if self.zero_idx is None:
+            parent_block = self._parent_block_()
+            self.zero_idx = parent_block.create_var(
+                name=unique_name.generate('zero_idx'), dtype='int64')
+            parent_block.append_op(
+                type='fill_constant',
+                inputs={},
+                outputs={'Out': [self.zero_idx]},
+                attrs={
+                    'shape': [1],
+                    'dtype': self.zero_idx.dtype,
+                    'value': float(0),
+                    'force_cpu': True
+                })
+
     def _parent_block_(self):
         prog = self.helper.main_program
         parent_idx = prog.current_block().parent_idx
@@ -1924,7 +1941,7 @@ def reorder_lod_tensor_by_rank(x, rank_table):
     helper.is_instance('x', Variable)
     helper.is_instance('rank_table', Variable)
 
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='reorder_lod_tensor_by_rank',
         inputs={'X': [x],
@@ -1958,7 +1975,7 @@ def is_empty(x, cond=None, **ignored):
     """
     helper = LayerHelper("is_empty", **locals())
     if cond is None:
-        cond = helper.create_tmp_variable(dtype='bool')
+        cond = helper.create_variable_for_type_inference(dtype='bool')
         cond.stop_gradient = True
     elif not isinstance(cond, Variable):
         raise TypeError("cond takes a variable")
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 1cfcbbb9c1614f21848e62cce79befc673e1739c..4ac94981a7a47530fe6ae4d968212c62dd3e0a93 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -116,8 +116,8 @@ def rpn_target_assign(bbox_pred,
     Returns:
         tuple:
                A tuple(predicted_scores, predicted_location, target_label,
-               target_bbox) is returned. The predicted_scores and
-               predicted_location is the predicted result of the RPN.
+               target_bbox, bbox_inside_weight) is returned. The predicted_scores 
+               and predicted_location is the predicted result of the RPN.
                The target_label and target_bbox is the ground truth,
                respectively. The predicted_location is a 2D Tensor with shape
                [F, 4], and the shape of target_bbox is same as the shape of
@@ -126,6 +126,8 @@ def rpn_target_assign(bbox_pred,
                [F + B, 1], and the shape of target_label is same as the shape
                of the predicted_scores, B is the number of the background
                anchors, the F and B is depends on the input of this operator.
+               Bbox_inside_weight represents whether the predicted loc is fake_fg
+               or not and the shape is [F, 4].
 
     Examples:
         .. code-block:: python
@@ -138,7 +140,7 @@ def rpn_target_assign(bbox_pred,
                           append_batch_size=False, dtype='float32')
         gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
                          append_batch_size=False, dtype='float32')
-        loc_pred, score_pred, loc_target, score_target =
+        loc_pred, score_pred, loc_target, score_target, bbox_inside_weight =
             fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
                                           cls_logits=cls_logits,
                                           anchor_box=anchor_box,
@@ -147,10 +149,13 @@ def rpn_target_assign(bbox_pred,
 
     helper = LayerHelper('rpn_target_assign', **locals())
     # Assign target label to anchors
-    loc_index = helper.create_tmp_variable(dtype='int32')
-    score_index = helper.create_tmp_variable(dtype='int32')
-    target_label = helper.create_tmp_variable(dtype='int32')
-    target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    loc_index = helper.create_variable_for_type_inference(dtype='int32')
+    score_index = helper.create_variable_for_type_inference(dtype='int32')
+    target_label = helper.create_variable_for_type_inference(dtype='int32')
+    target_bbox = helper.create_variable_for_type_inference(
+        dtype=anchor_box.dtype)
+    bbox_inside_weight = helper.create_variable_for_type_inference(
+        dtype=anchor_box.dtype)
     helper.append_op(
         type="rpn_target_assign",
         inputs={
@@ -163,7 +168,8 @@ def rpn_target_assign(bbox_pred,
             'LocationIndex': loc_index,
             'ScoreIndex': score_index,
             'TargetLabel': target_label,
-            'TargetBBox': target_bbox
+            'TargetBBox': target_bbox,
+            'BBoxInsideWeight': bbox_inside_weight
         },
         attrs={
             'rpn_batch_size_per_im': rpn_batch_size_per_im,
@@ -178,13 +184,14 @@ def rpn_target_assign(bbox_pred,
     score_index.stop_gradient = True
     target_label.stop_gradient = True
     target_bbox.stop_gradient = True
+    bbox_inside_weight.stop_gradient = True
 
     cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1))
     bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4))
     predicted_cls_logits = nn.gather(cls_logits, score_index)
     predicted_bbox_pred = nn.gather(bbox_pred, loc_index)
 
-    return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox
+    return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox, bbox_inside_weight
 
 
 def detection_output(loc,
@@ -282,7 +289,8 @@ def detection_output(loc,
     scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
     scores = nn.transpose(scores, perm=[0, 2, 1])
     scores.stop_gradient = True
-    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
+    nmsed_outs = helper.create_variable_for_type_inference(
+        dtype=decoded_box.dtype)
     helper.append_op(
         type="multiclass_nms",
         inputs={'Scores': scores,
@@ -314,7 +322,7 @@ def iou_similarity(x, y, name=None):
     """
     helper = LayerHelper("iou_similarity", **locals())
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -351,7 +359,8 @@ def box_coder(prior_box,
     helper = LayerHelper("box_coder", **locals())
 
     if name is None:
-        output_box = helper.create_tmp_variable(dtype=prior_box.dtype)
+        output_box = helper.create_variable_for_type_inference(
+            dtype=prior_box.dtype)
     else:
         output_box = helper.create_variable(
             name=name, dtype=prior_box.dtype, persistable=False)
@@ -382,7 +391,7 @@ def polygon_box_transform(input, name=None):
     """
     helper = LayerHelper("polygon_box_transform", **locals())
     if name is None:
-        output = helper.create_tmp_variable(dtype=input.dtype)
+        output = helper.create_variable_for_type_inference(dtype=input.dtype)
     else:
         output = helper.create_variable(
             name=name, dtype=prior_box.input, persistable=False)
@@ -450,7 +459,7 @@ def detection_map(detect_res,
     helper = LayerHelper("detection_map", **locals())
 
     def __create_var(type):
-        return helper.create_tmp_variable(dtype=type)
+        return helper.create_variable_for_type_inference(dtype=type)
 
     map_out = __create_var('float32')
     accum_pos_count_out = out_states[0] if out_states else __create_var('int32')
@@ -557,8 +566,9 @@ def bipartite_match(dist_matrix,
         >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
     """
     helper = LayerHelper('bipartite_match', **locals())
-    match_indices = helper.create_tmp_variable(dtype='int32')
-    match_distance = helper.create_tmp_variable(dtype=dist_matrix.dtype)
+    match_indices = helper.create_variable_for_type_inference(dtype='int32')
+    match_distance = helper.create_variable_for_type_inference(
+        dtype=dist_matrix.dtype)
     helper.append_op(
         type='bipartite_match',
         inputs={'DistMat': dist_matrix},
@@ -644,8 +654,8 @@ def target_assign(input,
                             gt, matched_indices, mismatch_value=0)
     """
     helper = LayerHelper('target_assign', **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    out_weight = helper.create_tmp_variable(dtype='float32')
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    out_weight = helper.create_variable_for_type_inference(dtype='float32')
     helper.append_op(
         type='target_assign',
         inputs={
@@ -816,9 +826,10 @@ def ssd_loss(location,
     conf_loss = nn.reshape(
         x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
     conf_loss.stop_gradient = True
-    neg_indices = helper.create_tmp_variable(dtype='int32')
+    neg_indices = helper.create_variable_for_type_inference(dtype='int32')
     dtype = matched_indices.dtype
-    updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
+    updated_matched_indices = helper.create_variable_for_type_inference(
+        dtype=dtype)
     helper.append_op(
         type='mine_hard_examples',
         inputs={
@@ -998,8 +1009,8 @@ def prior_box(input,
             max_sizes = [max_sizes]
         attrs['max_sizes'] = max_sizes
 
-    box = helper.create_tmp_variable(dtype)
-    var = helper.create_tmp_variable(dtype)
+    box = helper.create_variable_for_type_inference(dtype)
+    var = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="prior_box",
         inputs={"Input": input,
@@ -1337,8 +1348,8 @@ def anchor_generator(input,
         'offset': offset
     }
 
-    anchor = helper.create_tmp_variable(dtype)
-    var = helper.create_tmp_variable(dtype)
+    anchor = helper.create_variable_for_type_inference(dtype)
+    var = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="anchor_generator",
         inputs={"Input": input},
@@ -1384,7 +1395,7 @@ def roi_perspective_transform(input,
     """
     helper = LayerHelper('roi_perspective_transform', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="roi_perspective_transform",
         inputs={"X": input,
@@ -1413,16 +1424,49 @@ def generate_proposal_labels(rpn_rois,
                              use_random=True):
     """
     ** Generate proposal labels Faster-RCNN **
-    TODO(buxingyuan): Add Document
+    This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
+    to sample foreground boxes and background boxes, and compute loss target.
+
+    RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes
+    were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction,
+    If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample.
+    If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi,
+    then it was considered as a background sample.
+    After all foreground and background boxes are chosen (so called Rois),
+    then we apply random sampling to make sure
+    the number of foreground boxes is no more than batch_size_per_im * fg_fraction.
+
+    For each box in Rois, we assign the classification (class label) and regression targets (box label) to it.
+    Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss.
+
+    Args:
+        rpn_rois(Variable): A 2-D LoDTensor with shape [N, 4]. N is the number of the GenerateProposalOp's output, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
+        gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a class label of groundtruth.
+        is_crowd(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a flag indicates whether a groundtruth is crowd.
+        gt_boxes(Variable): A 2-D LoDTensor with shape [M, 4]. M is the number of groundtruth, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
+        im_info(Variable): A 2-D LoDTensor with shape [B, 3]. B is the number of input images, each element consists of im_height, im_width, im_scale.
+
+        batch_size_per_im(int): Batch size of rois per images.
+        fg_fraction(float): Foreground fraction in total batch_size_per_im.
+        fg_thresh(float): Overlap threshold which is used to chose foreground sample.
+        bg_thresh_hi(float): Overlap threshold upper bound which is used to chose background sample.
+        bg_thresh_lo(float): Overlap threshold lower bound which is used to chose background sample.
+        bbox_reg_weights(list|tuple): Box regression weights.
+        class_nums(int): Class number.
+        use_random(bool): Use random sampling to choose foreground and background boxes.
     """
 
     helper = LayerHelper('generate_proposal_labels', **locals())
 
-    rois = helper.create_tmp_variable(dtype=rpn_rois.dtype)
-    labels_int32 = helper.create_tmp_variable(dtype=gt_classes.dtype)
-    bbox_targets = helper.create_tmp_variable(dtype=rpn_rois.dtype)
-    bbox_inside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype)
-    bbox_outside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype)
+    rois = helper.create_variable_for_type_inference(dtype=rpn_rois.dtype)
+    labels_int32 = helper.create_variable_for_type_inference(
+        dtype=gt_classes.dtype)
+    bbox_targets = helper.create_variable_for_type_inference(
+        dtype=rpn_rois.dtype)
+    bbox_inside_weights = helper.create_variable_for_type_inference(
+        dtype=rpn_rois.dtype)
+    bbox_outside_weights = helper.create_variable_for_type_inference(
+        dtype=rpn_rois.dtype)
 
     helper.append_op(
         type="generate_proposal_labels",
@@ -1472,7 +1516,7 @@ def generate_proposals(scores,
                        eta=1.0,
                        name=None):
     """
-    ** Generate proposal labels Faster-RCNN **
+    ** Generate proposal Faster-RCNN **
 	
 	This operation proposes RoIs according to each box with their probability to be a foreground object and 
 	the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
@@ -1504,8 +1548,10 @@ def generate_proposals(scores,
     """
     helper = LayerHelper('generate_proposals', **locals())
 
-    rpn_rois = helper.create_tmp_variable(dtype=bbox_deltas.dtype)
-    rpn_roi_probs = helper.create_tmp_variable(dtype=scores.dtype)
+    rpn_rois = helper.create_variable_for_type_inference(
+        dtype=bbox_deltas.dtype)
+    rpn_roi_probs = helper.create_variable_for_type_inference(
+        dtype=scores.dtype)
     helper.append_op(
         type="generate_proposals",
         inputs={
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index dcd5a064a85cbfeedf41b7dc122cacba987e6570..95e13669ad927cdda56153c7fe58325f4a3ec474 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -954,7 +954,7 @@ def read_file(reader):
     """
     helper = LayerHelper('read_file')
     out = [
-        helper.create_tmp_variable(
+        helper.create_variable_for_type_inference(
             stop_gradient=True, dtype='float32')
         for _ in range(len(reader.desc.shapes()))
     ]
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 8c11921d9bde0920f33368837302d39f36f45556..eea0a362a0c31083f304a2167d0fdadfb30fb640 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -202,10 +202,12 @@ def generate_layer_fn(op_type):
             out_var = out[0] if (isinstance(out, list) or
                                  isinstance(out, tuple)) else out
         else:
-            out_var = helper.create_tmp_variable(dtype=dtype)
+            out_var = helper.create_variable_for_type_inference(dtype=dtype)
         outputs[o_name] = [out_var]
         for name in intermediate_output_names:
-            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
+            outputs[name] = [
+                helper.create_variable_for_type_inference(dtype=dtype)
+            ]
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
         return helper.append_activation(out_var)
@@ -229,7 +231,7 @@ def generate_layer_fn_noattr(op_type):
 
     def func(x, name=None):
         helper = LayerHelper(op_type, **locals())
-        output = helper.create_tmp_variable(dtype=x.dtype)
+        output = helper.create_variable_for_type_inference(dtype=x.dtype)
         helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output})
         return output
 
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index dfd801a098d6451dbdb20d9ba44187d1e3f8a91a..149224bb68ac869dec14ac9f953f0072bd24c7e2 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -27,7 +27,7 @@ from . import nn
 from . import ops
 from . import tensor
 from ..initializer import init_on_cpu
-from ..framework import default_main_program, Parameter, unique_name
+from ..framework import default_main_program, Parameter, unique_name, name_scope
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
@@ -332,14 +332,16 @@ def append_LARS(params_grads, learning_rate, weight_decay):
             return grad_norm + weight_decay * param_norm
 
     for param, grad in params_grads:
-        param_lr = param.optimize_attr['learning_rate']
-        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
-        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
-        if type(param_lr) == float and param_lr == 1.0:
-            decayed_lr = learning_rate * param_norm \
-                / _balanced_weight(param_norm, grad_norm)
-        else:
-            decayed_lr = learning_rate * param_lr * param_norm \
-                / _balanced_weight(param_norm, grad_norm)
-        # set back param local learning rate
-        param.optimize_attr['learning_rate'] = decayed_lr
+        with param.block.program.optimized_guard(
+            [param, grad]), name_scope("optimizer"):
+            param_lr = param.optimize_attr['learning_rate']
+            param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
+            grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
+            if type(param_lr) == float and param_lr == 1.0:
+                decayed_lr = learning_rate * param_norm \
+                    / _balanced_weight(param_norm, grad_norm)
+            else:
+                decayed_lr = learning_rate * param_lr * param_norm \
+                    / _balanced_weight(param_norm, grad_norm)
+            # set back param local learning rate
+            param.optimize_attr['learning_rate'] = decayed_lr
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index a3064b565d096f7feda18379c66ffc8bf2f4a55c..b2d2c93ead80d781d0a55ca541a1b0bb4232ad81 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -58,11 +58,11 @@ def accuracy(input, label, k=1, correct=None, total=None):
     """
     helper = LayerHelper("accuracy", **locals())
     topk_out, topk_indices = nn.topk(input, k=k)
-    acc_out = helper.create_tmp_variable(dtype="float32")
+    acc_out = helper.create_variable_for_type_inference(dtype="float32")
     if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
+        correct = helper.create_variable_for_type_inference(dtype="int64")
     if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
+        total = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="accuracy",
         inputs={
@@ -124,8 +124,8 @@ def auc(input,
             auc_out=fluid.layers.auc(input=prediction, label=label)
     """
     helper = LayerHelper("auc", **locals())
-    auc_out = helper.create_tmp_variable(dtype="float64")
-    batch_auc_out = helper.create_tmp_variable(dtype="float64")
+    auc_out = helper.create_variable_for_type_inference(dtype="float64")
+    batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
     # make tp, tn, fp, fn persistable, so that can accumulate all batches.
 
     # for batch auc
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 43aa4a9e7c4ffe96a8f1346cd1bc3e4f309ba309..b555546cb7ebbcdcb18a318716ed247f042d76f8 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -56,6 +56,7 @@ __all__ = [
     'sequence_expand',
     'sequence_expand_as',
     'sequence_pad',
+    'sequence_unpad',
     'lstm_unit',
     'reduce_sum',
     'reduce_mean',
@@ -64,6 +65,7 @@ __all__ = [
     'reduce_prod',
     'sequence_first_step',
     'sequence_last_step',
+    'sequence_slice',
     'dropout',
     'split',
     'ctc_greedy_decoder',
@@ -94,6 +96,7 @@ __all__ = [
     'pad_constant_like',
     'label_smooth',
     'roi_pool',
+    'roi_align',
     'dice_loss',
     'image_resize',
     'image_resize_short',
@@ -151,6 +154,12 @@ __all__ = [
     'mul',
     'sigmoid_cross_entropy_with_logits',
     'maxout',
+    'affine_grid',
+    'sequence_reverse',
+    'affine_channel',
+    'hash',
+    'log_loss',
+    'add_position_encoding',
 ]
 
 
@@ -238,7 +247,7 @@ def fc(input,
 
         w = helper.create_parameter(
             attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
-        tmp = helper.create_tmp_variable(dtype)
+        tmp = helper.create_variable_for_type_inference(dtype)
         helper.append_op(
             type="mul",
             inputs={"X": input_var,
@@ -251,7 +260,7 @@ def fc(input,
     if len(mul_results) == 1:
         pre_bias = mul_results[0]
     else:
-        pre_bias = helper.create_tmp_variable(dtype)
+        pre_bias = helper.create_variable_for_type_inference(dtype)
         helper.append_op(
             type="sum",
             inputs={"X": mul_results},
@@ -310,7 +319,7 @@ def embedding(input,
     helper = LayerHelper('embedding', **locals())
     w = helper.create_parameter(
         attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
-    tmp = helper.create_tmp_variable(dtype)
+    tmp = helper.create_variable_for_type_inference(dtype)
     padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
         size[0] + padding_idx)
     helper.append_op(
@@ -352,7 +361,6 @@ def dynamic_lstm(input,
         c_0(Variable): The initial cell state is an optional input, default is zero.
                        This is a tensor with shape (N x D), where N is the
                        batch size. `h_0` and `c_0` can be NULL but only at the same time.
-
         param_attr(ParamAttr|None): The parameter attribute for the learnable
                                hidden-hidden weights.
 
@@ -360,6 +368,11 @@ def dynamic_lstm(input,
                                                 W_{fh}, W_{oh}`}
                                - The shape is (D x 4D), where D is the hidden
                                  size.
+
+                               If it is set to None or one attribute of ParamAttr,
+                               dynamic_lstm will create ParamAttr as param_attr.
+                               If the Initializer of the param_attr is not set, the
+                               parameter is initialized with Xavier. Default: None.
         bias_attr (ParamAttr|None): The bias attribute for the learnable bias
                               weights, which contains two parts, input-hidden
                               bias weights and peephole connections weights if
@@ -372,6 +385,11 @@ def dynamic_lstm(input,
                                  - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                  W_{fc}, W_{oc}`}.
                                  - The shape is (1 x 7D).
+
+                              If it is set to None or one attribute of ParamAttr,
+                              dynamic_lstm will create ParamAttr as bias_attr.
+                              If the Initializer of the bias_attr is not set,
+                              the bias is initialized zero. Default: None.
         use_peepholes (bool): ${use_peepholes_comment}
         is_reverse (bool): ${is_reverse_comment}
         gate_activation (str): ${gate_activation_comment}
@@ -390,11 +408,11 @@ def dynamic_lstm(input,
 
             hidden_dim = 512
             forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
-                                           act=None, bias_attr=None)
+                                           bias_attr=False)
             forward, _ = fluid.layers.dynamic_lstm(
                 input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
     """
-
+    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
     helper = LayerHelper('lstm', **locals())
     size = size // 4
     weight = helper.create_parameter(
@@ -405,10 +423,10 @@ def dynamic_lstm(input,
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
 
-    hidden = helper.create_tmp_variable(dtype)
-    cell = helper.create_tmp_variable(dtype)
-    batch_gate = helper.create_tmp_variable(dtype)
-    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    hidden = helper.create_variable_for_type_inference(dtype)
+    cell = helper.create_variable_for_type_inference(dtype)
+    batch_gate = helper.create_variable_for_type_inference(dtype)
+    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
     inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
     batch_size = input.shape[0]
     if h_0:
@@ -529,6 +547,11 @@ def dynamic_lstmp(input,
                                  size.
                                - Projection weight = {:math:`W_{rh}`}.
                                - The shape of projection weight is (D x P).
+
+                               If it is set to None or one attribute of ParamAttr,
+                               dynamic_lstm will create ParamAttr as param_attr.
+                               If the Initializer of the param_attr is not set, the
+                               parameter is initialized with Xavier. Default: None.
         bias_attr(ParamAttr|None): The bias attribute for the learnable bias
                               weights, which contains two parts, input-hidden
                               bias weights and peephole connections weights if
@@ -541,6 +564,11 @@ def dynamic_lstmp(input,
                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                  W_{fc}, W_{oc}`}.
                                 - The shape is (1 x 7D).
+
+                              If it is set to None or one attribute of ParamAttr,
+                              dynamic_lstm will create ParamAttr as bias_attr.
+                              If the Initializer of the bias_attr is not set,
+                              the bias is initialized zero. Default: None.
         use_peepholes(bool): Whether to enable diagonal/peephole connections,
                              default `True`.
         is_reverse(bool): Whether to compute reversed LSTM, default `False`.
@@ -585,6 +613,7 @@ def dynamic_lstmp(input,
                                                      proj_activation="tanh")
     """
 
+    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
     helper = LayerHelper('lstmp', **locals())
     size = size // 4
     weight = helper.create_parameter(
@@ -597,12 +626,12 @@ def dynamic_lstmp(input,
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
 
-    projection = helper.create_tmp_variable(dtype)
-    cell = helper.create_tmp_variable(dtype)
-    ordered_proj0 = helper.create_tmp_variable(dtype)
-    batch_hidden = helper.create_tmp_variable(dtype)
-    batch_gate = helper.create_tmp_variable(dtype)
-    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    projection = helper.create_variable_for_type_inference(dtype)
+    cell = helper.create_variable_for_type_inference(dtype)
+    ordered_proj0 = helper.create_variable_for_type_inference(dtype)
+    batch_hidden = helper.create_variable_for_type_inference(dtype)
+    batch_gate = helper.create_variable_for_type_inference(dtype)
+    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type='lstmp',
@@ -682,8 +711,18 @@ def dynamic_gru(input,
               The first part are weights of the update gate and reset gate with
               shape :math:`(D \\times 2D)`, and the second part are weights for
               candidate hidden state with shape :math:`(D \\times D)`.
-        bias_attr(ParamAttr): The parameter attribute for learnable the
-            hidden-hidden bias.
+
+            If it is set to None or one attribute of ParamAttr, dynamic_gru will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates 
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate, 
+            reset gate and candidate calculations. If it is set to None or one 
+            attribute of ParamAttr, dynamic_gru will create ParamAttr as 
+            bias_attr. If the Initializer of the bias_attr is not set, the bias
+            is initialized zero. Default: None.
         is_reverse(bool): Whether to compute reversed GRU, default
             :attr:`False`.
         gate_activation(str): The activation for update gate and reset gate.
@@ -721,16 +760,16 @@ def dynamic_gru(input,
         attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
     batch_size = input.shape[0]
     inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
-    if h_0 != None:
+    if h_0:
         assert h_0.shape == (
             batch_size, size
         ), 'The shape of h0 should be(batch_size, %d)' % size
         inputs['H0'] = h_0
 
-    hidden = helper.create_tmp_variable(dtype)
-    batch_gate = helper.create_tmp_variable(dtype)
-    batch_reset_hidden_prev = helper.create_tmp_variable(dtype)
-    batch_hidden = helper.create_tmp_variable(dtype)
+    hidden = helper.create_variable_for_type_inference(dtype)
+    batch_gate = helper.create_variable_for_type_inference(dtype)
+    batch_reset_hidden_prev = helper.create_variable_for_type_inference(dtype)
+    batch_hidden = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type='gru',
@@ -782,10 +821,29 @@ def gru_unit(input,
 
     Args:
         input (Variable): The fc transformed input value of current step.
-        hidden (Variable): The hidden value of lstm unit from previous step.
+        hidden (Variable): The hidden value of gru unit from previous step.
         size (integer): The input dimension value.
-        param_attr (ParamAttr): The weight parameters for gru unit. Default: None
-        bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are weights of the update gate and reset gate with
+              shape :math:`(D \\times 2D)`, and the second part are weights for
+              candidate hidden state with shape :math:`(D \\times D)`.
+
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates 
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate, 
+            reset gate and candidate calculations. If it is set to None or one 
+            attribute of ParamAttr, gru_unit will create ParamAttr as 
+            bias_attr. If the Initializer of the bias_attr is not set, the bias
+            is initialized zero. Default: None.
         activation (string): The activation type for cell (actNode).
                              Default: 'tanh'
         gate_activation (string): The activation type for gates (actGate).
@@ -820,9 +878,9 @@ def gru_unit(input,
     weight = helper.create_parameter(
         attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
 
-    gate = helper.create_tmp_variable(dtype)
-    reset_hidden_pre = helper.create_tmp_variable(dtype)
-    updated_hidden = helper.create_tmp_variable(dtype)
+    gate = helper.create_variable_for_type_inference(dtype)
+    reset_hidden_pre = helper.create_variable_for_type_inference(dtype)
+    updated_hidden = helper.create_variable_for_type_inference(dtype)
     inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
     # create bias
     if helper.bias_attr:
@@ -872,10 +930,14 @@ def linear_chain_crf(input, label, param_attr=None):
         attr=helper.param_attr,
         shape=[size + 2, size],
         dtype=helper.input_dtype())
-    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
-    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
-    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
-    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
+    alpha = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    emission_exps = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    transition_exps = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    log_likelihood = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
     helper.append_op(
         type='linear_chain_crf',
         inputs={"Emission": [input],
@@ -914,7 +976,8 @@ def crf_decoding(input, param_attr, label=None):
     """
     helper = LayerHelper('crf_decoding', **locals())
     transition = helper.get_parameter(param_attr.name)
-    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
+    viterbi_path = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
     helper.append_op(
         type='crf_decoding',
         inputs={"Emission": [input],
@@ -938,9 +1001,9 @@ def cos_sim(X, Y):
         Variable: the output of cosine(X, Y).
     """
     helper = LayerHelper('cos_sim', **locals())
-    out = helper.create_tmp_variable(dtype=X.dtype)
-    xnorm = helper.create_tmp_variable(dtype=X.dtype)
-    ynorm = helper.create_tmp_variable(dtype=X.dtype)
+    out = helper.create_variable_for_type_inference(dtype=X.dtype)
+    xnorm = helper.create_variable_for_type_inference(dtype=X.dtype)
+    ynorm = helper.create_variable_for_type_inference(dtype=X.dtype)
     helper.append_op(
         type='cos_sim',
         inputs={'X': [X],
@@ -951,7 +1014,12 @@ def cos_sim(X, Y):
     return out
 
 
-def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
+def dropout(x,
+            dropout_prob,
+            is_test=False,
+            seed=None,
+            name=None,
+            dropout_implementation="downgrade_in_infer"):
     """
     Computes dropout.
 
@@ -971,6 +1039,21 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
                     units will be dropped. DO NOT use a fixed seed in training.
         name (str|None): A name for this layer(optional). If set None, the layer
                          will be named automatically.
+        dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train']
+                                        1. downgrade_in_infer(default), downgrade the outcome at inference
+                                           train: out = input * mask
+                                           inference: out = input * dropout_prob
+                                           (make is a tensor same shape with input, value is 0 or 1
+                                            ratio of 0 is dropout_prob)
+                                        2. upscale_in_train, upscale the outcome at training time
+                                           train: out = input * mask / ( 1.0 - dropout_prob )
+                                           inference: out = input
+                                           (make is a tensor same shape with input, value is 0 or 1
+                                            ratio of 0 is dropout_prob)
+                                           dropout op can be removed from the program. 
+                                           the program will be efficient
+                                        
+
 
     Returns:
         Variable: A tensor variable is the shape with `x`.
@@ -984,8 +1067,9 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     """
 
     helper = LayerHelper('dropout', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    mask = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
 
     if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
         seed = helper.main_program.random_seed
@@ -999,7 +1083,8 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
             'dropout_prob': dropout_prob,
             'is_test': is_test,
             'fix_seed': seed is not None,
-            'seed': seed if seed is not None else 0
+            'seed': seed if seed is not None else 0,
+            'dropout_implementation': dropout_implementation,
         })
     return out
 
@@ -1070,7 +1155,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
           cost = fluid.layers.cross_entropy(input=predict, label=label)
     """
     helper = LayerHelper('cross_entropy', **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='cross_entropy',
         inputs={'X': [input],
@@ -1117,14 +1202,14 @@ def square_error_cost(input, label):
 
     """
     helper = LayerHelper('square_error_cost', **locals())
-    minus_out = helper.create_tmp_variable(dtype=input.dtype)
+    minus_out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='elementwise_sub',
         inputs={'X': [input],
                 'Y': [label]},
         outputs={'Out': [minus_out]})
 
-    square_out = helper.create_tmp_variable(dtype=input.dtype)
+    square_out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='square', inputs={'X': [minus_out]},
         outputs={'Out': [square_out]})
@@ -1230,12 +1315,13 @@ def chunk_eval(input,
     helper = LayerHelper("chunk_eval", **locals())
 
     # prepare output
-    precision = helper.create_tmp_variable(dtype="float32")
-    recall = helper.create_tmp_variable(dtype="float32")
-    f1_score = helper.create_tmp_variable(dtype="float32")
-    num_infer_chunks = helper.create_tmp_variable(dtype="int64")
-    num_label_chunks = helper.create_tmp_variable(dtype="int64")
-    num_correct_chunks = helper.create_tmp_variable(dtype="int64")
+    precision = helper.create_variable_for_type_inference(dtype="float32")
+    recall = helper.create_variable_for_type_inference(dtype="float32")
+    f1_score = helper.create_variable_for_type_inference(dtype="float32")
+    num_infer_chunks = helper.create_variable_for_type_inference(dtype="int64")
+    num_label_chunks = helper.create_variable_for_type_inference(dtype="int64")
+    num_correct_chunks = helper.create_variable_for_type_inference(
+        dtype="int64")
 
     helper.append_op(
         type="chunk_eval",
@@ -1266,7 +1352,8 @@ def sequence_conv(input,
                   padding=None,
                   bias_attr=None,
                   param_attr=None,
-                  act=None):
+                  act=None,
+                  name=None):
     """
     This function creates the op for sequence_conv, using the inputs and
     other convolutional configurations for the filters and stride as given
@@ -1278,9 +1365,19 @@ def sequence_conv(input,
         filter_size (int): the filter size (H and W).
         filter_stride (int): stride of the filter.
         padding (bool): if True, add paddings.
-        bias_attr (ParamAttr|None): attributes for bias
-        param_attr (ParamAttr|None): attributes for parameter
-        act (str): the activation type
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, sequence_conv
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None.
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically. Default: None.
 
     Returns:
         Variable: output of sequence_conv
@@ -1291,7 +1388,7 @@ def sequence_conv(input,
     filter_shape = [filter_size * input.shape[1], num_filters]
     filter_param = helper.create_parameter(
         attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    pre_bias = helper.create_tmp_variable(dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type='sequence_conv',
@@ -1309,7 +1406,7 @@ def sequence_conv(input,
     return helper.append_activation(pre_act)
 
 
-def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False):
+def sequence_softmax(input, use_cudnn=False, name=None):
     """
     This function computes the softmax activation among all time-steps for each
     sequence. The dimension of each time-step should be 1. Thus, the shape of
@@ -1329,10 +1426,10 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False):
 
     Args:
         input (Variable): The input variable which is a LoDTensor.
-        bias_attr (ParamAttr|None): attributes for bias
-        param_attr (ParamAttr|None): attributes for parameter
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
-        library is installed. Default: False
+            library is installed. Default: False.
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically. Default: None.
 
     Returns:
         Variable: output of sequence_softmax
@@ -1347,7 +1444,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False):
     """
     helper = LayerHelper('sequence_softmax', **locals())
     dtype = helper.input_dtype()
-    softmax_out = helper.create_tmp_variable(dtype)
+    softmax_out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="sequence_softmax",
         inputs={"X": input},
@@ -1356,7 +1453,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False):
     return softmax_out
 
 
-def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
+def softmax(input, use_cudnn=True, name=None):
     """
     The input of the softmax operator is a tensor of any rank. The output tensor
     has the same shape as the input.
@@ -1383,10 +1480,10 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
 
     Args:
         input (Variable): The input variable.
-        bias_attr (ParamAttr): attributes for bias
-        param_attr (ParamAttr): attributes for parameter
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
-        library is installed.
+            library is installed.
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically. Default: None.
 
     Returns:
         Variable: output of softmax
@@ -1401,7 +1498,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
     """
     helper = LayerHelper('softmax', **locals())
     dtype = helper.input_dtype()
-    softmax_out = helper.create_tmp_variable(dtype)
+    softmax_out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="softmax",
         inputs={"X": input},
@@ -1492,14 +1589,23 @@ def conv2d(input,
             convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1
-        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
-        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
+            connected to the second half of the input channels. Default: groups=1.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+             and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
             library is installed. Default: True
-        act (str): Activation type. Default: None
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None
         name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
+            will be named automatically. Default: None
 
     Returns:
         Variable: The tensor variable storing the convolution and \
@@ -1517,7 +1623,7 @@ def conv2d(input,
     """
 
     num_channels = input.shape[1]
-
+    assert param_attr is not False, "param_attr should not be False here."
     l_type = 'conv2d'
     if (num_channels == groups and num_filters % num_channels == 0 and
             not use_cudnn):
@@ -1545,7 +1651,8 @@ def conv2d(input,
     filter_shape = [num_filters, int(num_filter_channels)] + filter_size
 
     def _get_default_param_initializer():
-        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+        filter_elem_num = filter_size[0] * filter_size[1] * num_channels
+        std = (2.0 / filter_elem_num)**0.5
         return Normal(0.0, std, 0)
 
     filter_param = helper.create_parameter(
@@ -1554,7 +1661,7 @@ def conv2d(input,
         dtype=dtype,
         default_initializer=_get_default_param_initializer())
 
-    pre_bias = helper.create_tmp_variable(dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type=l_type,
@@ -1656,13 +1763,22 @@ def conv3d(input,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. Default: groups=1
-        param_attr (ParamAttr): The parameters to the Conv3d Layer. Default: None
-        bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
+            will create ParamAttr as param_attr. If it is set to None, the parameter
+            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
+            :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv3d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
             library is installed. Default: True
-        act (str): Activation type. Default: None
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None.
         name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
+            will be named automatically. Default: None.
 
     Returns:
         Variable: The tensor variable storing the convolution and \
@@ -1680,7 +1796,7 @@ def conv3d(input,
     """
 
     l_type = 'conv3d'
-
+    assert param_attr is not False, "param_attr should not be False here."
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
 
@@ -1705,7 +1821,9 @@ def conv3d(input,
     filter_shape = [num_filters, num_filter_channels] + filter_size
 
     def _get_default_param_initializer():
-        std = (2.0 / (filter_size[0]**3 * num_channels))**0.5
+        filter_elem_num = filter_size[0] * filter_size[1] * filter_size[
+            2] * num_channels
+        std = (2.0 / filter_elem_num)**0.5
         return Normal(0.0, std, 0)
 
     filter_param = helper.create_parameter(
@@ -1714,7 +1832,7 @@ def conv3d(input,
         dtype=dtype,
         default_initializer=_get_default_param_initializer())
 
-    pre_bias = helper.create_tmp_variable(dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type=l_type,
@@ -1737,7 +1855,7 @@ def conv3d(input,
     return helper.append_activation(pre_act)
 
 
-def sequence_pool(input, pool_type):
+def sequence_pool(input, pool_type, is_test=False):
     """
     This function add the operator for sequence pooling.
     It pools features of all time-steps of each instance, and is applied
@@ -1774,6 +1892,7 @@ def sequence_pool(input, pool_type):
         input(variable): The input variable which is a LoDTensor.
         pool_type (string): The pooling type of sequence_pool.
             It supports average, sum, sqrt and max.
+        is_test(bool, Default False): Used distinguish training from scoring mode.
 
     Returns:
         The sequence pooling variable which is a Tensor.
@@ -1793,15 +1912,16 @@ def sequence_pool(input, pool_type):
     """
     helper = LayerHelper('sequence_pool', **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-    max_index = helper.create_tmp_variable(dtype)
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    max_index = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type="sequence_pool",
         inputs={"X": input},
         outputs={"Out": pool_out,
                  "MaxIndex": max_index},
-        attrs={"pooltype": pool_type.upper()})
+        attrs={"pooltype": pool_type.upper(),
+               "is_test": is_test})
 
     # when pool_type is max, variable max_index is initialized,
     # so we stop the gradient explicitly here
@@ -1830,7 +1950,7 @@ def sequence_concat(input, name=None):
            out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3])
     """
     helper = LayerHelper('sequence_concat', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     helper.append_op(
         type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]})
     return out
@@ -1902,6 +2022,76 @@ def sequence_last_step(input):
     return sequence_pool(input=input, pool_type="last")
 
 
+def sequence_slice(input, offset, length, name=None):
+    """
+    **Sequence Slice Layer**
+
+    The layer crops a subsequence from given sequence with given start
+    offset and subsequence length.
+
+    It only supports sequence data (LoDTensor with lod_level equal to 1).
+
+    .. code-block:: text
+
+	- Case:
+
+            Given the input Variable **input**:
+
+                input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
+                input.lod = [[3, 2]],
+                input.dims = (5, 2),
+
+            with offset.data = [[0], [1]] and length.data = [[2], [1]],
+
+            the output Variable will be
+
+                out.data = [[a1, a2], [b1, b2], [e1, e2]],
+                out.lod = [[2, 1]],
+                out.dims = (3, 2).
+
+    NOTE: The first dimension size of **input**, **offset** and **length**
+          should be equal. The **offset** should start from 0.
+
+    Args:
+        input(Variable): The input Variable which consists of the complete
+                         sequences.
+        offset(Variable): The offset to slice each sequence.
+        length(Variable): The length of each subsequence.
+        name(str|None): A name for this layer(optional). If set None, the
+                        layer will be named automatically.
+
+    Returns:
+        Variable: The output subsequences.
+
+    Examples:
+
+        .. code-block:: python
+
+             import numpy as np
+             seqs = fluid.layers.data(name='x', shape=[10, 5],
+                              dtype='float32', lod_level=1)
+             offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
+             length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
+             subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
+                                                   length=length)
+    """
+    helper = LayerHelper("sequence_slice", **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_variable_for_type_inference(dtype)
+
+    offset.stop_gradient = True
+    length.stop_gradient = True
+
+    helper.append_op(
+        type="sequence_slice",
+        inputs={"X": input,
+                "Offset": offset,
+                "Length": length},
+        outputs={"Out": out})
+
+    return out
+
+
 @templatedoc()
 def pool2d(input,
            pool_size=-1,
@@ -1973,7 +2163,7 @@ def pool2d(input,
 
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
+    pool_out = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type=l_type,
@@ -2041,7 +2231,7 @@ def pool3d(input,
     l_type = "pool3d"
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
+    pool_out = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type=l_type,
@@ -2107,8 +2297,14 @@ def batch_norm(input,
         is_test(bool, Default False): Used for training or training.
         momentum(float, Default 0.9):
         epsilon(float, Default 1e-05):
-        param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
-        bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
+        param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
+             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+             will create ParamAttr as param_attr. If the Initializer of the param_attr
+             is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm.
+             If it is set to None or one attribute of ParamAttr, batch_norm
+             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+             is not set, the bias is initialized zero. Default: None.
         data_layout(string, default NCHW): NCHW|NHWC
         in_place(bool, Default False): Make the input and output of batch norm reuse memory.
         name(string, Default None): A name for this layer(optional). If set None, the layer
@@ -2128,6 +2324,7 @@ def batch_norm(input,
             hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
             hidden2 = fluid.layers.batch_norm(input=hidden1)
     """
+    assert bias_attr is not False, "bias_attr should not be False in batch_norm."
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
 
@@ -2177,10 +2374,13 @@ def batch_norm(input,
     mean_out = mean
     # variance and variance out share the same memory
     variance_out = variance
-    saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    saved_variance = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
 
-    batch_norm_out = input if in_place else helper.create_tmp_variable(dtype)
+    batch_norm_out = input if in_place else helper.create_variable_for_type_inference(
+        dtype)
 
     helper.append_op(
         type="batch_norm",
@@ -2244,19 +2444,28 @@ def layer_norm(input,
     Args:
         input(Variable): The input tensor variable.
         scale(bool): Whether to learn the adaptive gain :math:`g` after
-            normalization.
+            normalization. Default True.
         shift(bool): Whether to learn the adaptive bias :math:`b` after
-            normalization.
-        begin_norm_axis(bool): The normalization will be performed along
+            normalization. Default True.
+        begin_norm_axis(int): The normalization will be performed along
             dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
+            Default 1.
         epsilon(float): The small value added to the variance to prevent
-            division by zero.
+            division by zero. Default 1e-05.
         param_attr(ParamAttr|None): The parameter attribute for the learnable
-            gain :math:`g`.
+            gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
+            omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
+            a default :code:`ParamAttr` would be added as scale. The
+            :attr:`param_attr` is initialized as 1 if it is added. Default None.
         bias_attr(ParamAttr|None): The parameter attribute for the learnable
-            bias :math:`b`.
+            bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
+            omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
+            a default :code:`ParamAttr` would be added as bias. The
+            :attr:`bias_attr` is initialized as 0 if it is added. Default None.
         act(str): Activation to be applied to the output of layer normalizaiton.
-        name (str): The name of this layer. It is optional.
+                  Default None.
+        name(str): The name of this layer. It is optional. Default None, and a
+                   unique name would be generated automatically.
 
     Returns:
         ${y_comment}
@@ -2288,9 +2497,11 @@ def layer_norm(input,
         inputs['Bias'] = bias
 
     # create output
-    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    layer_norm_out = helper.create_tmp_variable(dtype)
+    mean_out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    layer_norm_out = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type="layer_norm",
@@ -2397,15 +2608,22 @@ def conv2d_transpose(input,
             when group=2, the first half of the filters is only connected to the
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
-            Default: groups=1
-        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
-                               Default: None
-        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+            Default: groups = 1.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d_transpose
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
         use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act(str): Activation type. Default: None
+            library is installed. Default: True.
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None.
         name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
+            will be named automatically. Default: True.
 
     Returns:
         Variable: The tensor variable storing the convolution transpose result.
@@ -2420,7 +2638,7 @@ def conv2d_transpose(input,
           data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
           conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
     """
-
+    assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
     input_channel = input.shape[1]
 
     op_type = 'conv2d_transpose'
@@ -2456,6 +2674,7 @@ def conv2d_transpose(input,
     else:
         filter_size = utils.convert_to_list(filter_size, 2,
                                             'conv2d_transpose.filter_size')
+
     if output_size is None:
         output_size = []
     elif isinstance(output_size, list) or isinstance(output_size, int):
@@ -2465,10 +2684,11 @@ def conv2d_transpose(input,
     padding = utils.convert_to_list(padding, 2, 'padding')
     groups = 1 if groups is None else groups
     filter_shape = [input_channel, num_filters // groups] + filter_size
+
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
-    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type=op_type,
         inputs={'Input': [input],
@@ -2577,12 +2797,19 @@ def conv3d_transpose(input,
             first half of the input channels, while the second half of the
             filters is only connected to the second half of the input channels.
             Default: groups=1
-        param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer.
-            Default: None
-        bias_attr(ParamAttr): Bias parameter for the Conv3d layer. Default: None
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv3d_transpose
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
         use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
             library is installed. Default: True
-        act(str): Activation type. Default: None
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None.
         name(str|None): A name for this layer(optional). If set None, the layer
             will be named automatically.
 
@@ -2599,6 +2826,7 @@ def conv3d_transpose(input,
           data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
           conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3)
     """
+    assert param_attr is not False, "param_attr should not be False in conv3d_transpose."
     l_type = "conv3d_transpose"
     helper = LayerHelper(l_type, **locals())
     if not isinstance(input, Variable):
@@ -2638,7 +2866,7 @@ def conv3d_transpose(input,
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
-    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type=l_type,
         inputs={'Input': [input],
@@ -2717,7 +2945,7 @@ def sequence_expand(x, y, ref_level=-1, name=None):
     """
     helper = LayerHelper('sequence_expand', input=x, **locals())
     dtype = helper.input_dtype()
-    tmp = helper.create_tmp_variable(dtype)
+    tmp = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='sequence_expand',
         inputs={'X': x,
@@ -2783,7 +3011,7 @@ def sequence_expand_as(x, y, name=None):
     """
     helper = LayerHelper('sequence_expand_as', input=x, **locals())
     dtype = helper.input_dtype()
-    tmp = helper.create_tmp_variable(dtype)
+    tmp = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='sequence_expand_as',
         inputs={'X': x,
@@ -2793,7 +3021,7 @@ def sequence_expand_as(x, y, name=None):
 
 
 @templatedoc()
-def sequence_pad(x, pad_value, maxlen=None):
+def sequence_pad(x, pad_value, maxlen=None, name=None):
     """
     ${comment}
 
@@ -2807,7 +3035,9 @@ def sequence_pad(x, pad_value, maxlen=None):
             None or any positive int. When it is None, all sequences will be
             padded up to the length of the longest one among them; when it a
             certain positive value, it must be greater than the length of the
-            longest original sequence."
+            longest original sequence.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
     Returns:
         Variable: The padded sequence batch and the original lengths before
@@ -2820,14 +3050,15 @@ def sequence_pad(x, pad_value, maxlen=None):
 
             x = fluid.layers.data(name='y', shape=[10, 5],
                              dtype='float32', lod_level=1)
-            pad_value = fluid.layers.assign(input=numpy.array([0]))
+            pad_value = fluid.layers.assign(
+                input=numpy.array([0], dtype=numpy.float32))
             out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
     """
 
     helper = LayerHelper('sequence_pad', input=x, **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
-    length = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
+    length = helper.create_variable_for_type_inference(dtype)
 
     pad_value.stop_gradient = True
     length.stop_gradient = True
@@ -2844,6 +3075,66 @@ def sequence_pad(x, pad_value, maxlen=None):
     return out, length
 
 
+def sequence_unpad(x, length, name=None):
+    """
+    **Sequence Unpad Layer**
+
+    This layer removes the padding data in the input sequences and convert
+    them into sequences with actual length as output, identitied by lod
+    information.
+
+    .. code-block:: text
+
+	Example:
+
+	Given input Variable **x**:
+	    x.data = [[ 1.0,  2.0,  3.0,  4.0,  5.0],
+		      [ 6.0,  7.0,  8.0,  9.0, 10.0],
+		      [11.0, 12.0, 13.0, 14.0, 15.0]],
+
+	in which there are 3 sequences padded to length 5, and the acutal length
+	specified by input Variable **length**:
+
+	    length.data = [[2], [3], [4]],
+
+	after unpadding, the output Variable will be:
+
+	    out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
+	    out.lod = [[2, 3, 4]]
+
+    Args:
+        x(Variable): Input Variable which contains the padded sequences with
+            equal length.
+        length(Variable): The Variable that specifies the actual ength of
+            sequences after unpadding.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The Variable contains the unpadded sequences.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32')
+            len = fluid.layers.data(name='length', shape=[1], dtype='int64')
+            out = fluid.layers.sequence_unpad(x=x, length=len)
+    """
+
+    helper = LayerHelper('sequence_unpad', input=x, **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_variable_for_type_inference(dtype)
+
+    length.stop_gradient = True
+
+    helper.append_op(
+        type='sequence_unpad',
+        inputs={'X': x,
+                'Length': length},
+        outputs={'Out': out})
+    return out
+
+
 def beam_search(pre_ids,
                 pre_scores,
                 ids,
@@ -2931,8 +3222,9 @@ def beam_search(pre_ids,
     score_type = scores.dtype
     id_type = ids.dtype
 
-    selected_scores = helper.create_tmp_variable(dtype=score_type)
-    selected_ids = helper.create_tmp_variable(dtype=id_type)
+    selected_scores = helper.create_variable_for_type_inference(
+        dtype=score_type)
+    selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
 
     helper.append_op(
         type='beam_search',
@@ -2989,8 +3281,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None):
                 ids, scores, beam_size=5, end_id=0)
     """
     helper = LayerHelper('beam_search_decode', **locals())
-    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
-    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
+    sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype)
+    sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype)
 
     helper.append_op(
         type="beam_search_decode",
@@ -3055,10 +3347,18 @@ def lstm_unit(x_t,
         cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
             shape M x S, M for batch size and S for size of lstm unit.
         forget_bias (float): The forget bias of lstm unit.
-        param_attr (ParamAttr): The attributes of parameter weights, used to set
-            initializer, name etc.
-        bias_attr (ParamAttr): The attributes of bias weights, if not False,
-            bias weights will be created and be set to default value.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                               hidden-hidden weights.
+                               If it is set to None or one attribute of ParamAttr,
+                               lstm_unit will create ParamAttr as param_attr.
+                               If the Initializer of the param_attr is not set, the
+                               parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
+                              weights. If it is set to False, no bias will be added
+                              to the output units. If it is set to None or one attribute of ParamAttr,
+                              lstm_unit will create ParamAttr as bias_attr.
+                              If the Initializer of the bias_attr is not set,
+                              the bias is initialized zero. Default: None.
         name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.
 
@@ -3112,8 +3412,8 @@ def lstm_unit(x_t,
                 param_attr=param_attr,
                 bias_attr=bias_attr)
     dtype = x_t.dtype
-    c = helper.create_tmp_variable(dtype)
-    h = helper.create_tmp_variable(dtype)
+    c = helper.create_variable_for_type_inference(dtype)
+    h = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type='lstm_unit',
@@ -3167,7 +3467,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
 
     """
     helper = LayerHelper('reduce_sum', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3224,7 +3524,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0]
     """
     helper = LayerHelper('reduce_mean', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3279,7 +3579,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0]
     """
     helper = LayerHelper('reduce_max', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3334,7 +3634,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0]
     """
     helper = LayerHelper('reduce_min', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3390,7 +3690,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0]
     """
     helper = LayerHelper('reduce_prod', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3450,7 +3750,7 @@ def split(input, num_or_sections, dim=-1, name=None):
             dim], 'len(num_or_sections) must not be more than input.shape[dim].'
         num = len(num_or_sections)
     outs = [
-        helper.create_tmp_variable(dtype=helper.input_dtype())
+        helper.create_variable_for_type_inference(dtype=helper.input_dtype())
         for i in range(num)
     ]
     helper.append_op(
@@ -3507,8 +3807,8 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
         axis = 0
     helper = LayerHelper("l2_normalize", **locals())
 
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    norm = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    norm = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type="norm",
         inputs={"X": x},
@@ -3617,7 +3917,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     __check_input(x, y)
 
     helper = LayerHelper('matmul', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='matmul',
         inputs={'X': x,
@@ -3688,8 +3988,8 @@ def topk(input, k, name=None):
             top5_values, top5_indices = layers.topk(input, k=5)
     """
     helper = LayerHelper("top_k", **locals())
-    values = helper.create_tmp_variable(dtype=input.dtype)
-    indices = helper.create_tmp_variable(dtype="int64")
+    values = helper.create_variable_for_type_inference(dtype=input.dtype)
+    indices = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="top_k",
         inputs={"X": [input]},
@@ -3747,8 +4047,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None):
 
     # remove some tokens from input and labels
     if ignored_tokens is not None and len(ignored_tokens) > 0:
-        erased_input = helper.create_tmp_variable(dtype="int64")
-        erased_label = helper.create_tmp_variable(dtype="int64")
+        erased_input = helper.create_variable_for_type_inference(dtype="int64")
+        erased_label = helper.create_variable_for_type_inference(dtype="int64")
 
         helper.append_op(
             type="sequence_erase",
@@ -3765,8 +4065,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None):
         label = erased_label
 
     # edit distance op
-    edit_distance_out = helper.create_tmp_variable(dtype="int64")
-    sequence_num = helper.create_tmp_variable(dtype="int64")
+    edit_distance_out = helper.create_variable_for_type_inference(dtype="int64")
+    sequence_num = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="edit_distance",
         inputs={"Hyps": [input],
@@ -3841,7 +4141,7 @@ def ctc_greedy_decoder(input, blank, name=None):
     _, topk_indices = topk(input, k=1)
 
     # ctc align op
-    ctc_out = helper.create_tmp_variable(dtype="int64")
+    ctc_out = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="ctc_align",
         inputs={"Input": [topk_indices]},
@@ -3891,8 +4191,8 @@ def warpctc(input, label, blank=0, norm_by_times=False):
 
     """
     helper = LayerHelper('warpctc', **locals())
-    loss_out = helper.create_tmp_variable(dtype=input.dtype)
-    grad_out = helper.create_tmp_variable(dtype=input.dtype)
+    loss_out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    grad_out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='warpctc',
         inputs={'Logits': [input],
@@ -3953,7 +4253,7 @@ def sequence_reshape(input, new_dim):
             x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
     """
     helper = LayerHelper('sequence_reshape', **locals())
-    out = helper.create_tmp_variable(helper.input_dtype())
+    out = helper.create_variable_for_type_inference(helper.input_dtype())
     helper.append_op(
         type='sequence_reshape',
         inputs={'X': [input]},
@@ -3972,7 +4272,8 @@ def nce(input,
         sample_weight=None,
         param_attr=None,
         bias_attr=None,
-        num_neg_samples=None):
+        num_neg_samples=None,
+        name=None):
     """
     ${comment}
 
@@ -3983,9 +4284,18 @@ def nce(input,
         sample_weight (Variable|None): A Variable of shape [batch_size, 1]
             storing a weight for each sample. The default weight for each
             sample is 1.0.
-        param_attr (ParamAttr|None): attributes for parameter
-        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+             of nce. If it is set to None or one attribute of ParamAttr, nce
+             will create ParamAttr as param_attr. If the Initializer of the param_attr
+             is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of nce.
+             If it is set to False, no bias will be added to the output units.
+             If it is set to None or one attribute of ParamAttr, nce
+             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+             is not set, the bias is initialized zero. Default: None.
         num_neg_samples (int): ${num_neg_samples_comment}
+        name (str|None): A name for this layer(optional). If set None, the layer
+             will be named automatically. Default: None.
 
     Returns:
         Variable: The output nce loss.
@@ -4018,22 +4328,31 @@ def nce(input,
     """
     helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)
-    dim = input.shape[1]
     assert isinstance(label, Variable)
+
+    dim = input.shape[1]
     num_true_class = label.shape[1]
     w = helper.create_parameter(
         attr=helper.param_attr,
         shape=[num_total_classes, dim],
         is_bias=False,
         dtype=input.dtype)
-    b = helper.create_parameter(
-        attr=helper.bias_attr,
-        shape=[num_total_classes, 1],
-        is_bias=True,
-        dtype=input.dtype)
-    cost = helper.create_tmp_variable(dtype=input.dtype)
-    sample_logits = helper.create_tmp_variable(dtype=input.dtype)
-    sample_labels = helper.create_tmp_variable(dtype=label.dtype)
+    inputs = {
+        'Input': input,
+        'Label': label,
+        'Weight': w,
+        'SampleWeight': sample_weight if sample_weight is not None else []
+    }
+    if helper.bias_attr:
+        b = helper.create_parameter(
+            attr=helper.bias_attr,
+            shape=[num_total_classes, 1],
+            is_bias=True,
+            dtype=input.dtype)
+        inputs['Bias'] = b
+    cost = helper.create_variable_for_type_inference(dtype=input.dtype)
+    sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype)
+    sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype)
 
     if num_neg_samples is None:
         num_neg_samples = 10
@@ -4047,13 +4366,7 @@ def nce(input,
 
     helper.append_op(
         type='nce',
-        inputs={
-            'Input': input,
-            'Label': label,
-            'Weight': w,
-            'Bias': b,
-            'SampleWeight': sample_weight if sample_weight is not None else []
-        },
+        inputs=inputs,
         outputs={
             'Cost': cost,
             'SampleLogits': sample_logits,
@@ -4063,7 +4376,12 @@ def nce(input,
     return cost / (num_neg_samples + 1)
 
 
-def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
+def hsigmoid(input,
+             label,
+             num_classes,
+             param_attr=None,
+             bias_attr=None,
+             name=None):
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language model. This operator organizes the classes into a
@@ -4084,11 +4402,17 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
         label (Variable): The tensor variable contains labels of training data.
             It's a tensor with shape is :math:`[N \\times 1]`.
         num_classes: (int), The number of classes, must not be less than 2.
-        param_attr (ParamAttr|list of ParamAttr, default None): The parameter
-             attribute for learnable parameters/weights of this layer.
-        bias_attr (ParamAttr|list of ParamAttr, default None):  The parameter
-             attribute for the bias of this layer. If it is set to False, no
-             bias will be applied.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+             of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
+             will create ParamAttr as param_attr. If the Initializer of the param_attr
+             is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of hsigmoid.
+             If it is set to False, no bias will be added to the output units.
+             If it is set to None or one attribute of ParamAttr, hsigmoid
+             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+             is not set, the bias is initialized zero. Default: None.
+        name (str|None): A name for this layer(optional). If set None, the layer
+             will be named automatically. Default: None.
 
     Returns:
         Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
@@ -4104,8 +4428,8 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
 
     helper = LayerHelper('hierarchical_sigmoid', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
-    pre_out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
+    pre_out = helper.create_variable_for_type_inference(dtype)
     dim = input.shape[1]
     if num_classes < 2:
         raise ValueError("num_classes must not be less than 2.")
@@ -4149,7 +4473,10 @@ def transpose(x, perm, name=None):
     Examples:
         .. code-block:: python
 
-            x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32')
+            # use append_batch_size=False to avoid prepending extra 
+            # batch size in shape
+            x = fluid.layers.data(name='x', shape=[5, 10, 15], 
+                            dtype='float32', append_batch_size=False)
             x_transposed = layers.transpose(x, perm=[1, 0, 2])
     """
 
@@ -4165,8 +4492,8 @@ def transpose(x, perm, name=None):
                 (idx, perm[idx], len(x.shape)))
 
     helper = LayerHelper('transpose', **locals())
-    out = helper.create_tmp_variable(x.dtype)
-    x_shape = helper.create_tmp_variable(x.dtype)
+    out = helper.create_variable_for_type_inference(x.dtype)
+    x_shape = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
         type='transpose2',
         inputs={'X': [x]},
@@ -4308,7 +4635,7 @@ def im2sequence(input,
         inputs["Y"] = input_image_size
         attrs["out_stride"] = out_stride
     helper = LayerHelper('im2sequence', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     helper.append_op(
         type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
     return out
@@ -4341,7 +4668,7 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     filter_shape = [future_context_size + 1, input.shape[1]]
     filter_param = helper.create_parameter(
         attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='row_conv',
         inputs={'X': [input],
@@ -4374,7 +4701,7 @@ def multiplex(inputs, index):
         raise ValueError("inputs should be a list object and contains at least "
                          "2 elements.")
 
-    out = helper.create_tmp_variable(inputs[0].dtype)
+    out = helper.create_variable_for_type_inference(inputs[0].dtype)
     helper.append_op(
         type='multiplex',
         inputs={'X': inputs,
@@ -4386,7 +4713,8 @@ def multiplex(inputs, index):
 def softmax_with_cross_entropy(logits,
                                label,
                                soft_label=False,
-                               ignore_index=-100):
+                               ignore_index=-100,
+                               numeric_stable_mode=False):
     """
     **Softmax With Cross Entropy Operator.**
 
@@ -4420,6 +4748,18 @@ def softmax_with_cross_entropy(logits,
         \\left(\\text{logit}_i - \\log\\left(\\sum_{i=0}^{K}
         \\exp(\\text{logit}_i)\\right)\\right), j = 1,...,K
 
+    3) If numeric_stable_mode is True, softmax is calculated first by:
+
+    .. math::
+        
+        max_j = \\max_{i=0}^{K}{\\text{logit}_i}
+
+        log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
+
+        softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
+
+    and then cross entropy loss is calculated by softmax and label.
+
     Args:
         logits (Variable): The unscaled log probabilities, which is a 2-D tensor
             with shape [N x K]. N is the batch_size, and K is the class number.
@@ -4431,6 +4771,13 @@ def softmax_with_cross_entropy(logits,
         ignore_index (int): Specifies a target value that is ignored and does
                             not contribute to the input gradient. Only valid
                             if soft_label is set to False. Default: -100
+        numeric_stable_mode (bool): A flag to indicate whether to use a more
+                                    numerically stable algorithm. Only valid
+                                    when soft_label is False and GPU is used.
+                                    When soft_label is True or CPU is used, 
+                                    the algorithm is always numerically stable. 
+                                    Note that the speed may be slower when use 
+                                    stable algorithm. Default: False
 
     Returns:
         Variable: The cross entropy loss is a 2-D tensor with shape [N x 1].
@@ -4445,16 +4792,19 @@ def softmax_with_cross_entropy(logits,
                 logits=fc, label=label)
     """
     helper = LayerHelper('softmax_with_cross_entropy', **locals())
-    softmax = helper.create_tmp_variable(dtype=logits.dtype)
-    loss = helper.create_tmp_variable(dtype=logits.dtype)
+    softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
+    loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
     helper.append_op(
         type='softmax_with_cross_entropy',
         inputs={'Logits': logits,
                 'Label': label},
         outputs={'Softmax': softmax,
                  'Loss': loss},
-        attrs={'soft_label': soft_label,
-               'ignore_index': ignore_index})
+        attrs={
+            'soft_label': soft_label,
+            'ignore_index': ignore_index,
+            'numeric_stable_mode': numeric_stable_mode
+        })
     return loss
 
 
@@ -4496,8 +4846,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
 
     helper = LayerHelper('smooth_l1_loss', **locals())
-    diff = helper.create_tmp_variable(dtype=x.dtype)
-    loss = helper.create_tmp_variable(dtype=x.dtype)
+    diff = helper.create_variable_for_type_inference(dtype=x.dtype)
+    loss = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='smooth_l1_loss',
         inputs={
@@ -4530,7 +4880,7 @@ def one_hot(input, depth):
             one_hot_label = layers.one_hot(input=label, depth=10)
     """
     helper = LayerHelper("one_hot", **locals())
-    one_hot_out = helper.create_tmp_variable(dtype='float32')
+    one_hot_out = helper.create_variable_for_type_inference(dtype='float32')
     helper.append_op(
         type="one_hot",
         inputs={'X': input},
@@ -4578,7 +4928,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
     return counter
 
 
-def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
+def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
     """
     Gives a new shape to the input Tensor without changing its data.
 
@@ -4626,15 +4976,22 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                                 :attr:`shape` specifying shape. That is to
                                 say :attr:`actual_shape` has a higher priority
                                 than :attr:`shape`.
-        act (str): The non-linear activation to be applied to output variable.
-        inplace(bool): If this flag is set true, the output
-                       shares data with input without copying, otherwise
-                       a new output tensor is created
-                       whose data is copied from input x.
+        act (str): The non-linear activation to be applied to the reshaped tensor
+                   variable.
+        inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
+                       operators. If this flag is set :attr:`True`, reuse input
+                       :attr:`x` to reshape, which will change the shape of
+                       tensor variable :attr:`x` and might cause errors when
+                       :attr:`x` is used in multiple operators. If :attr:`False`,
+                       preserve the shape :attr:`x` and create a new output tensor
+                       variable whose data is copied from input x but reshaped.
         name (str): The name of this layer. It is optional.
 
     Returns:
-        Variable: The output tensor.
+        Variable: The reshaped tensor variable if :attr:`act` is None. It is a \
+                  new tensor variable if :attr:`inplace` is :attr:`False`, \
+                  otherwise it is :attr:`x`. If :attr:`act` is not None, return \
+                  the activated tensor variable.
 
     Raises:
         TypeError: if actual_shape is neither Variable nor None.
@@ -4645,7 +5002,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
             data = fluid.layers.data(
                 name='data', shape=[2, 4, 6], dtype='float32')
             reshaped = fluid.layers.reshape(
-                x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True)
+                x=data, shape=[-1, 0, 3, 2], inplace=True)
     """
 
     if not (isinstance(shape, list) or isinstance(shape, tuple)):
@@ -4672,8 +5029,9 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                 "except one unknown dimension.")
 
     helper = LayerHelper("reshape2", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    x_shape = helper.create_tmp_variable(dtype=x.dtype)
+    out = x if inplace else helper.create_variable_for_type_inference(
+        dtype=x.dtype)
+    x_shape = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type="reshape2",
         inputs=inputs,
@@ -4722,8 +5080,8 @@ def squeeze(input, axes, name=None):
             y = layers.sequeeze(input=x, axes=[1])
     """
     helper = LayerHelper("squeeze", **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    x_shape = helper.create_tmp_variable(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type="squeeze2",
         inputs={"X": input},
@@ -4759,8 +5117,8 @@ def unsqueeze(input, axes, name=None):
             y = layers.unsequeeze(input=x, axes=[1])
     """
     helper = LayerHelper("unsqueeze", **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    x_shape = helper.create_tmp_variable(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type="unsqueeze2",
         inputs={"X": input},
@@ -4850,7 +5208,7 @@ def lod_reset(x, y=None, target_lod=None):
             out = layers.lod_reset(x=x, y=y)
     """
     helper = LayerHelper("lod_reset", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     if y is not None:
         helper.append_op(
             type="lod_reset", inputs={'X': x,
@@ -4919,8 +5277,9 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
             "dims of input must be 4(not %d), and it's order must be NCHW" %
             (dims))
 
-    mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    lrn_out = helper.create_tmp_variable(dtype)
+    mid_out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    lrn_out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="lrn",
         inputs={"X": input},
@@ -4985,7 +5344,7 @@ def pad(x, paddings, pad_value=0., name=None):
     """
     helper = LayerHelper('pad', input=x, **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='pad',
         inputs={'X': x},
@@ -5065,7 +5424,7 @@ def pad_constant_like(x, y, pad_value=0., name=None):
     """
     helper = LayerHelper('pad_constant_like', input=x, **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='pad_constant_like',
         inputs={'X': x,
@@ -5130,7 +5489,7 @@ def label_smooth(label,
         raise ValueError("The value of epsilon must be between 0 and 1.")
     helper = LayerHelper("label_smooth", **locals())
     label.stop_gradient = True
-    smooth_label = helper.create_tmp_variable(dtype)
+    smooth_label = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="label_smooth",
         inputs={"X": label,
@@ -5162,8 +5521,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
     """
     helper = LayerHelper('roi_pool', **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-    argmaxes = helper.create_tmp_variable(dtype='int32')
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    argmaxes = helper.create_variable_for_type_inference(dtype='int32')
     helper.append_op(
         type="roi_pool",
         inputs={"X": input,
@@ -5178,6 +5537,54 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
     return pool_out
 
 
+@templatedoc()
+def roi_align(input,
+              rois,
+              pooled_height=1,
+              pooled_width=1,
+              spatial_scale=1.0,
+              sampling_ratio=-1,
+              name=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${x_comment}
+        rois (Variable): ROIs (Regions of Interest) to pool over.
+        pooled_height (integer): ${pooled_height_comment} Default: 1
+        pooled_width (integer): ${pooled_width_comment} Default: 1
+        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
+        sampling_ratio(intger): ${sampling_ratio_comment} Default: -1
+
+    Returns:
+        Variable: ${out_comment}.
+    Examples:
+        .. code-block:: python
+
+            align_out = fluid.layers.roi_align(input=x,
+                                               rois=rois,
+                                               pooled_height=7,
+                                               pooled_width=7,
+                                               spatial_scale=0.5,
+                                               sampling_ratio=-1)
+    """
+    helper = LayerHelper('roi_align', **locals())
+    dtype = helper.input_dtype()
+    align_out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type="roi_align",
+        inputs={"X": input,
+                "ROIs": rois},
+        outputs={"Out": align_out},
+        attrs={
+            "pooled_height": pooled_height,
+            "pooled_width": pooled_width,
+            "spatial_scale": spatial_scale,
+            "sampling_ratio": sampling_ratio
+        })
+    return align_out
+
+
 def dice_loss(input, label, epsilon=0.00001):
     """
     Dice loss for comparing the similarity of two batch of data,
@@ -5288,7 +5695,7 @@ def image_resize(input,
         out_h = int(input.shape[2] * scale)
         out_w = int(input.shape[3] * scale)
 
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type=resample_methods[resample],
         inputs=inputs,
@@ -5397,7 +5804,7 @@ def gather(input, index):
     """
     helper = LayerHelper('gather', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="gather",
         inputs={"X": input,
@@ -5437,7 +5844,7 @@ def scatter(input, index, updates, name=None):
     """
     helper = LayerHelper('scatter', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="scatter",
         inputs={"X": input,
@@ -5497,7 +5904,7 @@ def sequence_scatter(input, index, updates, name=None):
     """
     helper = LayerHelper('sequence_scatter', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="sequence_scatter",
         inputs={"X": input,
@@ -5527,7 +5934,7 @@ def random_crop(x, shape, seed=None):
     """
     helper = LayerHelper("random_crop", **locals())
     dtype = x.dtype
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     if seed is None:
         seed = np.random.randint(-65536, 65536)
     op_attrs = {"shape": shape}
@@ -5573,7 +5980,7 @@ def log(x, name=None):
     """
     helper = LayerHelper('log', **locals())
     dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out})
     return out
 
@@ -5604,7 +6011,7 @@ def relu(x, name=None):
     """
     helper = LayerHelper('relu', **locals())
     dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
     return out
 
@@ -5643,9 +6050,9 @@ def mean_iou(input, label, num_classes):
     """
     helper = LayerHelper('mean_iou', **locals())
     dtype = helper.input_dtype()
-    out_mean_iou = helper.create_tmp_variable(dtype='float32')
-    out_wrong = helper.create_tmp_variable(dtype='int32')
-    out_correct = helper.create_tmp_variable(dtype='int32')
+    out_mean_iou = helper.create_variable_for_type_inference(dtype='float32')
+    out_wrong = helper.create_variable_for_type_inference(dtype='int32')
+    out_correct = helper.create_variable_for_type_inference(dtype='int32')
     helper.append_op(
         type="mean_iou",
         inputs={"Predictions": input,
@@ -5737,7 +6144,7 @@ def crop(x, shape=None, offsets=None, name=None):
     if offsets is None:
         offsets = [0] * len(x.shape)
 
-    out = helper.create_tmp_variable(x.dtype)
+    out = helper.create_variable_for_type_inference(x.dtype)
     ipts = {'X': x}
     attrs = {}
     if isinstance(shape, Variable):
@@ -5757,6 +6164,124 @@ def crop(x, shape=None, offsets=None, name=None):
     return out
 
 
+def affine_grid(theta, out_shape, name=None):
+    """
+    It generates a grid of (x,y) coordinates using the parameters of
+    the affine transformation that correspond to a set of points where
+    the input feature map should be sampled to produce the transformed
+    output feature map.
+
+    .. code-block:: text
+
+        * Case 1:
+
+          Given:
+
+              theta = [[[x_11, x_12, x_13]
+                        [x_14, x_15, x_16]]
+                       [[x_21, x_22, x_23]
+                        [x_24, x_25, x_26]]]
+      
+              out_shape = [2, 3, 5, 5]
+      
+          Step 1:
+      
+              Generate normalized coordinates according to out_shape.
+              The values of the normalized coordinates are in the interval between -1 and 1.
+              The shape of the normalized coordinates is [2, H, W] as below:
+      
+              C = [[[-1.  -1.  -1.  -1.  -1. ]
+                    [-0.5 -0.5 -0.5 -0.5 -0.5]
+                    [ 0.   0.   0.   0.   0. ]
+                    [ 0.5  0.5  0.5  0.5  0.5]
+                    [ 1.   1.   1.   1.   1. ]]
+                   [[-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]]]
+              C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
+
+          Step2:
+
+              Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
+              C_ = [[-1.  -1.   1. ]
+                    [-0.5 -1.   1. ]
+                    [ 0.  -1.   1. ]
+                    [ 0.5 -1.   1. ]
+                    [ 1.  -1.   1. ]
+                    [-1.  -0.5  1. ]
+                    [-0.5 -0.5  1. ]
+                    [ 0.  -0.5  1. ]
+                    [ 0.5 -0.5  1. ]
+                    [ 1.  -0.5  1. ]
+                    [-1.   0.   1. ]
+                    [-0.5  0.   1. ]
+                    [ 0.   0.   1. ]
+                    [ 0.5  0.   1. ]
+                    [ 1.   0.   1. ]
+                    [-1.   0.5  1. ]
+                    [-0.5  0.5  1. ]
+                    [ 0.   0.5  1. ]
+                    [ 0.5  0.5  1. ]
+                    [ 1.   0.5  1. ]
+                    [-1.   1.   1. ]
+                    [-0.5  1.   1. ]
+                    [ 0.   1.   1. ]
+                    [ 0.5  1.   1. ]
+                    [ 1.   1.   1. ]]
+          Step3:
+              Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
+
+    Args:
+        theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
+        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
+        out_shape can be a Variable or a list or tuple.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The output with shape [N, H, W, 2].
+
+    Raises:
+        ValueError: If the type of arguments is not supported.
+
+    Examples:
+
+        .. code-block:: python
+            theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
+            out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
+            data = fluid.layers.affine_grid(theta, out_shape)
+
+            # or
+            data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
+
+    """
+    helper = LayerHelper('affine_grid')
+
+    if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \
+        isinstance(out_shape, Variable)):
+        raise ValueError("The out_shape should be a list, tuple or Variable.")
+
+    if not isinstance(theta, Variable):
+        raise ValueError("The theta should be a Variable.")
+
+    out = helper.create_variable_for_type_inference(theta.dtype)
+    ipts = {'Theta': theta}
+    attrs = {}
+    if isinstance(out_shape, Variable):
+        ipts['OutputShape'] = out_shape
+    else:
+        attrs['output_shape'] = out_shape
+
+    helper.append_op(
+        type='affine_grid',
+        inputs=ipts,
+        outputs={'Output': out},
+        attrs=None if len(attrs) == 0 else attrs)
+    return out
+
+
 def rank_loss(label, left, right, name=None):
     """
     **Rank loss layer for RankNet**
@@ -5817,7 +6342,7 @@ def rank_loss(label, left, right, name=None):
     if not (isinstance(right, Variable)):
         raise ValueError("The right should be a Variable")
 
-    out = helper.create_tmp_variable("float32")
+    out = helper.create_variable_for_type_inference("float32")
 
     helper.append_op(
         type='rank_loss',
@@ -5863,8 +6388,8 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
         raise ValueError("The left should be a Variable.")
     if not isinstance(right, Variable):
         raise ValueError("The right should be a Variable.")
-    out = helper.create_tmp_variable(left.dtype)
-    act = helper.create_tmp_variable(left.dtype)
+    out = helper.create_variable_for_type_inference(left.dtype)
+    act = helper.create_variable_for_type_inference(left.dtype)
     helper.append_op(
         type='margin_rank_loss',
         inputs={"Label": label,
@@ -5949,7 +6474,7 @@ def pad2d(input,
 
     helper = LayerHelper('pad2d', **locals())
     dtype = helper.input_dtype(input_param_name='input')
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='pad2d',
         inputs={'X': input},
@@ -5978,7 +6503,7 @@ def elu(x, alpha=1.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('elu', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='elu',
         inputs={'X': x},
@@ -6001,7 +6526,7 @@ def relu6(x, threshold=6.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('relu6', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='relu6',
         inputs={'X': x},
@@ -6024,7 +6549,7 @@ def pow(x, factor=1.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('pow', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='pow',
         inputs={'X': x},
@@ -6048,7 +6573,7 @@ def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('stanh', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='stanh',
         inputs={'X': x},
@@ -6073,7 +6598,7 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('hard_sigmoid', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='hard_sigmoid',
         inputs={'X': x},
@@ -6097,7 +6622,7 @@ def swish(x, beta=1.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('swish', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='swish',
         inputs={'X': x},
@@ -6149,7 +6674,7 @@ def prelu(x, mode, param_attr=None, name=None):
         dtype='float32',
         is_bias=False,
         default_initializer=Constant(1.0))
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="prelu",
         inputs={"X": x,
@@ -6173,7 +6698,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('brelu', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='brelu',
         inputs={'X': x},
@@ -6196,7 +6721,7 @@ def leaky_relu(x, alpha=0.02, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('leaky_relu', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='leaky_relu',
         inputs={'X': x},
@@ -6218,7 +6743,7 @@ def soft_relu(x, threshold=40.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('soft_relu', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='soft_relu',
         inputs={'X': x},
@@ -6285,8 +6810,8 @@ def flatten(x, axis=1, name=None):
     if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0:
         raise ValueError("The axis should be a int, and in range [0, rank(x)]")
 
-    out = helper.create_tmp_variable(x.dtype)
-    x_shape = helper.create_tmp_variable(x.dtype)
+    out = helper.create_variable_for_type_inference(x.dtype)
+    x_shape = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
         type='flatten2',
         inputs={"X": x},
@@ -6332,7 +6857,8 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
             out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
     """
     helper = LayerHelper('sequence_enumerate', **locals())
-    out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True)
+    out = helper.create_variable_for_type_inference(
+        helper.input_dtype(), stop_gradient=True)
     helper.append_op(
         type='sequence_enumerate',
         inputs={'X': input},
@@ -6372,9 +6898,9 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
 
     helper = LayerHelper('sequence_mask', **locals())
     if name is None:
-        out = helper.create_tmp_variable(dtype=dtype)
+        out = helper.create_variable_for_type_inference(dtype=dtype)
     else:
-        out = helper.create_tmp_variable(dtype=dtype, name=name)
+        out = helper.create_variable_for_type_inference(dtype=dtype, name=name)
 
     helper.append_op(
         type='sequence_mask',
@@ -6417,7 +6943,7 @@ def stack(x, axis=0):
     if not isinstance(x, list) and not isinstance(x, tuple):
         x = [x]
 
-    out = helper.create_tmp_variable(x[0].dtype)
+    out = helper.create_variable_for_type_inference(x[0].dtype)
     helper.append_op(
         type='stack', inputs={'X': x}, outputs={'Y': out},
         attrs={'axis': axis})
@@ -6455,7 +6981,7 @@ def unstack(x, axis=0, num=None):
 
     outs = []
     for _ in num:
-        outs.append(helper.create_tmp_variable(x.dtype))
+        outs.append(helper.create_variable_for_type_inference(x.dtype))
 
     helper.append_op(
         type='unstack',
@@ -6507,7 +7033,7 @@ def expand(x, expand_times, name=None):
     """
     helper = LayerHelper('expand', input=x, **locals())
     dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='expand',
         inputs={'X': x},
@@ -6546,7 +7072,7 @@ def uniform_random_batch_size_like(input,
     """
 
     helper = LayerHelper('uniform_random_batch_size_like', **locals())
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     c_dtype = convert_np_dtype_to_dtype_(dtype)
     helper.append_op(
         type='uniform_random_batch_size_like',
@@ -6583,7 +7109,7 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
     """
 
     helper = LayerHelper('gaussian_random', **locals())
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     c_dtype = convert_np_dtype_to_dtype_(dtype)
     helper.append_op(
         type='gaussian_random',
@@ -6618,7 +7144,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
     """
 
     helper = LayerHelper('sampling_id', **locals())
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='sampling_id',
         inputs={'X': x},
@@ -6657,7 +7183,7 @@ def gaussian_random_batch_size_like(input,
     """
 
     helper = LayerHelper('gaussian_random_batch_size_like', **locals())
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     c_dtype = convert_np_dtype_to_dtype_(dtype)
     helper.append_op(
         type='gaussian_random_batch_size_like',
@@ -6689,7 +7215,8 @@ def sum(x):
     """
 
     helper = LayerHelper('sum', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype('x'))
+    out = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype('x'))
     helper.append_op(
         type='sum',
         inputs={'X': x},
@@ -6716,7 +7243,8 @@ def slice(input, axes, starts, ends):
     """
 
     helper = LayerHelper('slice', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
+    out = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype('input'))
     helper.append_op(
         type='slice',
         inputs={'Input': input},
@@ -6742,7 +7270,8 @@ def shape(input):
     """
 
     helper = LayerHelper('shape', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
+    out = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype('input'))
     helper.append_op(
         type='shape', inputs={'Input': input}, outputs={'Out': out})
 
@@ -6759,7 +7288,7 @@ def _elementwise_op(helper):
     use_mkldnn = helper.kwargs.get('use_mkldnn', False)
     name = helper.kwargs.get('name', None)
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -6793,7 +7322,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
 
     helper = LayerHelper('scale', **locals())
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -6859,7 +7388,7 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
 
     if out is None:
         if name is None:
-            out = helper.create_tmp_variable(dtype=x.dtype)
+            out = helper.create_variable_for_type_inference(dtype=x.dtype)
         else:
             out = helper.create_variable(
                 name=name, dtype=x.dtype, persistable=False)
@@ -6967,7 +7496,7 @@ def clip(x, min, max, name=None):
     helper = LayerHelper("clip", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -6999,7 +7528,7 @@ def clip_by_norm(x, max_norm, name=None):
     helper = LayerHelper("clip_by_norm", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7029,7 +7558,7 @@ def mean(x, name=None):
     helper = LayerHelper("mean", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7059,7 +7588,7 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
     helper = LayerHelper("mul", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7093,7 +7622,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
     helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7123,7 +7652,7 @@ def maxout(x, groups, name=None):
     helper = LayerHelper("maxout", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7134,3 +7663,195 @@ def maxout(x, groups, name=None):
         attrs={"groups": groups},
         outputs={"Out": out})
     return out
+
+
+@templatedoc()
+def sequence_reverse(x, name=None):
+    """ 
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        name(basestring|None): Name of the output.
+
+    Returns:
+        out(${y_type}): ${y_comment}
+    """
+    helper = LayerHelper("sequence_reverse", **locals())
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+
+    helper.append_op(
+        type="sequence_reverse",
+        inputs={"X": x},
+        outputs={"Y": out},
+        attrs=dict())
+    return out
+
+
+def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
+    """
+    Applies a separate affine transformation to each channel of the input.
+    Useful for replacing spatial batch norm with its equivalent fixed
+    transformation. The input also can be 2D tensor and applies a affine
+    transformation in second dimension.
+
+    Args:
+        x (Variable): Feature map input can be a 4D tensor with order NCHW
+            or NHWC. It also can be a 2D tensor and the affine transformation
+            is applied in the second dimension.
+        scale (Variable): 1D input of shape (C), the c-th element is the scale
+            factor of the affine transformation for the c-th channel of
+            the input.
+        bias (Variable): 1D input of shape (C), the c-th element is the bias
+            of the affine transformation for the c-th channel of the input.
+        data_layout (string, default NCHW): NCHW or NHWC. If input is 2D
+            tensor, you can ignore data_layout.
+        name (str, default None): The name of this layer.
+
+    Returns:
+        out (Variable): A tensor of the same shape and data layout with x.
+    """
+    helper = LayerHelper("affine_channel", **locals())
+
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+
+    helper.append_op(
+        type="affine_channel",
+        inputs={"X": x,
+                'Scale': scale,
+                'Bias': bias},
+        attrs={"data_layout": data_layout},
+        outputs={"Out": out})
+    return out
+
+
+def hash(input, hash_size, num_hash=1, name=None):
+    """
+    hash the input
+     Args:
+        input (Variable): The input variable which is a one-hot word.
+        hash_size (int): The space size for hash algorithm.
+        num_hash (int): The times of hash, default 1.
+        name (str, default None): The name of this layer.
+     Returns:
+        Variable: The hash result variable which is a LoDTensor.
+     Examples:
+        .. code-block:: python
+            word_dict = paddle.dataset.imdb.word_dict()
+            x = fluid.layers.data(shape[1], dtype='int32', lod_level=1)
+            out = fluid.layers.hash(input=x, len(word_dict))
+    """
+    helper = LayerHelper('hash', **locals())
+    out = helper.create_variable_for_type_inference(
+        helper.input_dtype(), stop_gradient=True)
+    helper.append_op(
+        type='hash',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'num_hash': num_hash,
+               'mod_by': hash_size})
+    return out
+
+
+def log_loss(input, label, epsilon=1e-4, name=None):
+    """
+    **Negative Log Loss Layer**
+
+    This layer accepts input predictions and target label and returns the
+    negative log loss.
+
+    .. math::
+
+        Out = -label * \\log{(input + \\epsilon)}
+              - (1 - label) * \\log{(1 - input + \\epsilon)}
+
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
+                                batch size. This input is a probability computed
+                                by the previous operator.
+        label (Variable|list):  the ground truth which is a 2-D tensor with
+                                shape [N x 1], where N is the batch size.
+        epsilon (float): epsilon
+        name (string): the name of log_loss
+
+    Returns:
+        Variable: A 2-D tensor with shape [N x 1], the negative log loss.
+
+    Examples:
+        .. code-block:: python
+
+          prob = fluid.layers.sigmoid(net)
+          cost = fluid.layers.log_loss(input=prob, label=label)
+    """
+    helper = LayerHelper('log_loss', **locals())
+
+    if name is None:
+        loss = helper.create_variable_for_type_inference(dtype=input.dtype)
+    else:
+        loss = helper.create_variable(
+            name=name, dtype=input.dtype, persistable=False)
+
+    helper.append_op(
+        type='log_loss',
+        inputs={'Predicted': [input],
+                'Labels': [label]},
+        outputs={'Loss': [loss]},
+        attrs={'epsilon': epsilon})
+    return loss
+
+
+def add_position_encoding(input, alpha, beta, name=None):
+    """
+    **Add Position Encoding Layer**
+
+    This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
+    output Tensor of shape [N x M x P] with positional encoding value.
+
+    Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
+
+    .. math::
+        PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})}   \\\\
+        PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})}  \\\\
+        Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
+
+    Where:
+    * PE(pos, 2i): the increment for the number at even position
+    * PE(pos, 2i + 1): the increment for the number at odd position
+
+    Args:
+        input (Variable): 3-D input tensor with shape [N x M x P]
+        alpha (float): multiple of Input Tensor
+        beta (float): multiple of Positional Encoding Tensor
+        name (string): the name of position encoding layer
+
+    Returns:
+        Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
+
+    Examples:
+        .. code-block:: python
+
+          position_tensor = fluid.layers.add_position_encoding(input=tensor)
+    """
+    helper = LayerHelper('add_position_encoding', **locals())
+    dtype = helper.input_dtype()
+
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=dtype)
+    else:
+        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
+
+    helper.append_op(
+        type="add_position_encoding",
+        inputs={"X": input},
+        outputs={"Out": out},
+        attrs={"alpha": alpha,
+               "beta": beta})
+    return out
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 9c6a2112a69637aad4baff9a0296330620e3c52f..09a7cb8dc9339afa666f8cf09e92a27ffba8a9b3 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -152,7 +152,7 @@ def cast(x, dtype):
             result = fluid.layers.cast(x=data, dtype='float64')
     """
     helper = LayerHelper('cast', **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
+    out = helper.create_variable_for_type_inference(dtype=dtype)
     helper.append_op(
         type='cast',
         inputs={'X': [x]},
@@ -184,7 +184,7 @@ def concat(input, axis=0, name=None):
            out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
     """
     helper = LayerHelper('concat', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     helper.append_op(
         type='concat',
         inputs={'X': input},
@@ -221,7 +221,8 @@ def sums(input, out=None):
     """
     helper = LayerHelper('sum', **locals())
     if out is None:
-        out = helper.create_tmp_variable(dtype=helper.input_dtype())
+        out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
     helper.append_op(
         type='sum',
         inputs={'X': input},
@@ -252,7 +253,7 @@ def assign(input, output=None):
     """
     helper = LayerHelper('assign', **locals())
     if output is None:
-        output = helper.create_tmp_variable(dtype=input.dtype)
+        output = helper.create_variable_for_type_inference(dtype=input.dtype)
     if isinstance(input, Variable):
         helper.append_op(
             type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
@@ -311,7 +312,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
 
     helper = LayerHelper("fill_constant", **locals())
     if out is None:
-        out = helper.create_tmp_variable(dtype=dtype)
+        out = helper.create_variable_for_type_inference(dtype=dtype)
     helper.append_op(
         type='fill_constant',
         inputs={},
@@ -358,7 +359,7 @@ def fill_constant_batch_size_like(input,
         ${out_comment}.
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
+    out = helper.create_variable_for_type_inference(dtype=dtype)
     helper.append_op(
         type='fill_constant_batch_size_like',
         inputs={'Input': input},
@@ -396,7 +397,7 @@ def argmin(x, axis=0):
           out = fluid.layers.argmin(x=in, axis=-1)
     """
     helper = LayerHelper("arg_min", **locals())
-    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
+    out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
     helper.append_op(
         type='arg_min',
         inputs={'X': x},
@@ -427,7 +428,7 @@ def argmax(x, axis=0):
           out = fluid.layers.argmax(x=in, axis=-1)
     """
     helper = LayerHelper("arg_max", **locals())
-    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
+    out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
     helper.append_op(
         type='arg_max',
         inputs={'X': x},
@@ -477,8 +478,10 @@ def argsort(input, axis=-1, name=None):
             out, indices = fluid.layers.argsort(input, axis=0)
     """
     helper = LayerHelper("argsort", **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype, stop_gradient=True)
-    ids = helper.create_tmp_variable(VarDesc.VarType.INT64, stop_gradient=True)
+    out = helper.create_variable_for_type_inference(
+        dtype=input.dtype, stop_gradient=True)
+    ids = helper.create_variable_for_type_inference(
+        VarDesc.VarType.INT64, stop_gradient=True)
     helper.append_op(
         type='argsort',
         inputs={'X': input},
@@ -562,7 +565,7 @@ def reverse(x, axis):
     if isinstance(axis, int):
         axis = [axis]
     helper = LayerHelper("reverse", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='reverse',
         inputs={'Input': x},
@@ -654,7 +657,7 @@ def has_inf(x):
         Variable: The tensor variable storing the output, only a bool value.
     """
     helper = LayerHelper("isinf", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isinf", inputs={"X": x}, outputs={"Out": out})
     return out
 
@@ -670,7 +673,7 @@ def has_nan(x):
         Variable: The tensor variable storing the output, only a bool value.
     """
     helper = LayerHelper("isnan", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out})
     return out
 
@@ -687,6 +690,6 @@ def isfinite(x):
         Variable: The tensor variable storing the output, contains a bool value.
     """
     helper = LayerHelper("isfinite", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
     return out
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 0c2800dcf35ed156b71625babea2724f520575e5..f65b37903a35fa2bf9f2c2b2f169ce6fd4c478db 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 """
 Fluid Metrics
-
-The metrics are accomplished via Python natively.
 """
 
 from __future__ import print_function
@@ -24,6 +22,12 @@ import copy
 import warnings
 import six
 
+from .layer_helper import LayerHelper
+from .initializer import Constant
+from . import unique_name
+from .framework import Program, Variable, program_guard
+from . import layers
+
 __all__ = [
     'MetricBase',
     'CompositeMetric',
@@ -190,7 +194,7 @@ class CompositeMetric(MetricBase):
                                or soft-label, should custom the corresponding update rule.
         """
         for m in self._metrics:
-            ans.append(m.update(preds, labels))
+            m.update(preds, labels)
 
     def eval(self):
         """
@@ -474,71 +478,10 @@ class EditDistance(MetricBase):
                 "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
             )
         avg_distance = self.total_distance / self.seq_num
-        avg_instance_error = self.instance_error / self.seq_num
+        avg_instance_error = self.instance_error / float(self.seq_num)
         return avg_distance, avg_instance_error
 
 
-class DetectionMAP(MetricBase):
-    """
-    Calculate the detection mean average precision (mAP).
-    mAP is the metric to measure the accuracy of object detectors
-    like Faster R-CNN, SSD, etc.
-    It is the average of the maximum precisions at different recall values.
-    Please get more information from the following articles:
-      https://sanchom.wordpress.com/tag/average-precision/
-
-      https://arxiv.org/abs/1512.02325
-
-    The general steps are as follows:
-
-        1. calculate the true positive and false positive according to the input
-            of detection and labels.
-        2. calculate mAP value, support two versions: '11 point' and 'integral'.
-
-    Examples:
-        .. code-block:: python
-
-            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
-            batch_map = layers.detection_map(
-                input,
-                label,
-                class_num,
-                background_label,
-                overlap_threshold=overlap_threshold,
-                evaluate_difficult=evaluate_difficult,
-                ap_version=ap_version)
-            metric = fluid.metrics.DetectionMAP()
-            for data in train_reader():
-                loss, preds, labels = exe.run(fetch_list=[cost, batch_map])
-                batch_size = data[0]
-                metric.update(value=batch_map, weight=batch_size)
-                numpy_map = metric.eval()
-    """
-
-    def __init__(self, name=None):
-        super(DetectionMAP, self).__init__(name)
-        # the current map value
-        self.value = .0
-        self.weight = .0
-
-    def update(self, value, weight):
-        if not _is_number_or_matrix_(value):
-            raise ValueError(
-                "The 'value' must be a number(int, float) or a numpy ndarray.")
-        if not _is_number_(weight):
-            raise ValueError("The 'weight' must be a number(int, float).")
-        self.value += value
-        self.weight += weight
-
-    def eval(self):
-        if self.weight == 0:
-            raise ValueError(
-                "There is no data in DetectionMAP Metrics. "
-                "Please check layers.detection_map output has added to DetectionMAP."
-            )
-        return self.value / self.weight
-
-
 class Auc(MetricBase):
     """
     Auc metric adapts to the binary classification.
@@ -616,3 +559,179 @@ class Auc(MetricBase):
             idx -= 1
 
         return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
+
+
+class DetectionMAP(object):
+    """
+    Calculate the detection mean average precision (mAP).
+
+    The general steps are as follows:
+    1. calculate the true positive and false positive according to the input
+        of detection and labels.
+    2. calculate mAP value, support two versions: '11 point' and 'integral'.
+
+    Please get more information from the following articles:
+      https://sanchom.wordpress.com/tag/average-precision/
+      https://arxiv.org/abs/1512.02325
+
+    Args:
+        input (Variable): The detection results, which is a LoDTensor with shape
+            [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
+        gt_label (Variable): The ground truth label index, which is a LoDTensor
+            with shape [N, 1].
+        gt_box (Variable): The ground truth bounding box (bbox), which is a
+            LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
+        gt_difficult (Variable|None): Whether this ground truth is a difficult
+            bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
+            it means all the ground truth labels are not difficult bbox.
+        class_num (int): The class number.
+        background_label (int): The index of background label, the background
+            label will be ignored. If set to -1, then all categories will be
+            considered, 0 by defalut.
+        overlap_threshold (float): The threshold for deciding true/false
+            positive, 0.5 by defalut.
+        evaluate_difficult (bool): Whether to consider difficult ground truth
+            for evaluation, True by defalut. This argument does not work when
+            gt_difficult is None.
+        ap_version (string): The average precision calculation ways, it must be
+            'integral' or '11point'. Please check
+            https://sanchom.wordpress.com/tag/average-precision/ for details.
+            - 11point: the 11-point interpolated average precision.
+            - integral: the natural integral of the precision-recall curve.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(place)
+            map_evaluator = fluid.Evaluator.DetectionMAP(input,
+                gt_label, gt_box, gt_difficult)
+            cur_map, accum_map = map_evaluator.get_map_var()
+            fetch = [cost, cur_map, accum_map]
+            for epoch in PASS_NUM:
+                map_evaluator.reset(exe)
+                for data in batches:
+                    loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
+
+        In the above example:
+
+            'cur_map_v' is the mAP of current mini-batch.
+            'accum_map_v' is the accumulative mAP of one pass.
+    """
+
+    def __init__(self,
+                 input,
+                 gt_label,
+                 gt_box,
+                 gt_difficult=None,
+                 class_num=None,
+                 background_label=0,
+                 overlap_threshold=0.5,
+                 evaluate_difficult=True,
+                 ap_version='integral'):
+
+        self.helper = LayerHelper('map_eval')
+        gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
+        if gt_difficult:
+            gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
+            label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        else:
+            label = layers.concat([gt_label, gt_box], axis=1)
+
+        # calculate mean average precision (mAP) of current mini-batch
+        map = layers.detection_map(
+            input,
+            label,
+            class_num,
+            background_label,
+            overlap_threshold=overlap_threshold,
+            evaluate_difficult=evaluate_difficult,
+            ap_version=ap_version)
+
+        states = []
+        states.append(
+            self._create_state(
+                dtype='int32', shape=None, suffix='accum_pos_count'))
+        states.append(
+            self._create_state(
+                dtype='float32', shape=None, suffix='accum_true_pos'))
+        states.append(
+            self._create_state(
+                dtype='float32', shape=None, suffix='accum_false_pos'))
+        var = self._create_state(dtype='int32', shape=[1], suffix='has_state')
+        self.helper.set_variable_initializer(
+            var, initializer=Constant(value=int(0)))
+        self.has_state = var
+
+        # calculate accumulative mAP
+        accum_map = layers.detection_map(
+            input,
+            label,
+            class_num,
+            background_label,
+            overlap_threshold=overlap_threshold,
+            evaluate_difficult=evaluate_difficult,
+            has_state=self.has_state,
+            input_states=states,
+            out_states=states,
+            ap_version=ap_version)
+
+        layers.fill_constant(
+            shape=self.has_state.shape,
+            value=1,
+            dtype=self.has_state.dtype,
+            out=self.has_state)
+
+        self.cur_map = map
+        self.accum_map = accum_map
+
+    def _create_state(self, suffix, dtype, shape):
+        """
+        Create state variable.
+        Args:
+            suffix(str): the state suffix.
+            dtype(str|core.VarDesc.VarType): the state data type
+            shape(tuple|list): the shape of state
+        Returns: State variable
+        """
+        state = self.helper.create_variable(
+            name="_".join([unique_name.generate(self.helper.name), suffix]),
+            persistable=True,
+            dtype=dtype,
+            shape=shape)
+        return state
+
+    def get_map_var(self):
+        """
+        Returns: mAP variable of current mini-batch and
+            accumulative mAP variable cross mini-batches.
+        """
+        return self.cur_map, self.accum_map
+
+    def reset(self, executor, reset_program=None):
+        """
+        Reset metric states at the begin of each pass/user specified batch.
+
+        Args:
+            executor(Executor): a executor for executing
+                the reset_program.
+            reset_program(Program|None): a single Program for reset process.
+                If None, will create a Program.
+        """
+
+        def _clone_var_(block, var):
+            assert isinstance(var, Variable)
+            return block.create_var(
+                name=var.name,
+                shape=var.shape,
+                dtype=var.dtype,
+                type=var.type,
+                lod_level=var.lod_level,
+                persistable=var.persistable)
+
+        if reset_program is None:
+            reset_program = Program()
+        with program_guard(main_program=reset_program):
+            var = _clone_var_(reset_program.current_block(), self.has_state)
+            layers.fill_constant(
+                shape=var.shape, value=0, dtype=var.dtype, out=var)
+        executor.run(reset_program)
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 1dabad54f5b976e0fcabf6918d3bc6ece4eed384..00d33b36fcc3266bf7f08020052d28172665e53e 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -64,23 +64,33 @@ def simple_img_conv_pool(input,
             average-pooling. Default :math:`max`.
         global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
             pool_size and pool_padding while be ignored. Default False
-        conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a
+        conv_stride (int|list|tuple): The stride size of the conv2d Layer. If stride is a
             list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise,
             the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1.
-        conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
+        conv_padding (int|list|tuple): The padding size of the conv2d Layer. If padding is
             a list or  tuple, it must contain two integers, (conv_padding_H, conv_padding_W).
             Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0.
-        conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is
+        conv_dilation (int|list|tuple): The dilation size of the conv2d Layer. If dilation is
             a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W).
             Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1.
-        conv_groups (int): The groups number of the Conv2d Layer. According to grouped
+        conv_groups (int): The groups number of the conv2d Layer. According to grouped
             convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1
-        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
-        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
-        act (str): Activation type for Conv2d. Default: None
+            connected to the second half of the input channels. Default: groups=1.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`.
+            Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        act (str): Activation type for conv2d, if it is set to None, activation is not
+            appended. Default: None.
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
             library is installed. Default: True
 
diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py
index 667db10d3ebdd24ddd9efbe2310ebb331e268ee2..4e1d1450dea85fe4eb3e68713250836e4beac992 100644
--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -120,6 +120,8 @@ class OpDescCreationMethod(object):
                     new_attr.strings.extend(user_defined_attr)
                 elif attr.type == framework_pb2.BOOLEANS:
                     new_attr.bools.extend(user_defined_attr)
+                elif attr.type == framework_pb2.LONGS:
+                    new_attr.longs.extend(user_defined_attr)
                 elif attr.type == framework_pb2.INT_PAIRS:
                     for p in user_defined_attr:
                         pair = new_attr.int_pairs.add()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index ed1784bd27697ed8acf235fb3a72cc201cc99bc6..7e2364a5a872cdd8cf590438cc081ab070db767d 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -14,8 +14,9 @@
 
 from __future__ import print_function
 import re
+import sys
 from collections import defaultdict
-from paddle.fluid.framework import Program, Variable, name_scope
+from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
 from . import framework
 from . import layers
 from .backward import append_backward
@@ -32,7 +33,8 @@ __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
     'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer'
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum',
+    'LarsMomentumOptimizer'
 ]
 
 
@@ -105,13 +107,15 @@ class Optimizer(object):
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
         if type(param_lr) == Variable:
-            print("returns updated param lr ", param_lr)
             return param_lr
         else:
             if param_lr == 1.0:
                 return self._global_learning_rate()
             else:
-                return self._global_learning_rate() * param_lr
+                with default_main_program()._lr_schedule_guard(
+                        is_with_opt=True), framework.name_scope(
+                            'scale_with_param_lr'):
+                    return self._global_learning_rate() * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -397,6 +401,91 @@ class MomentumOptimizer(Optimizer):
         return momentum_op
 
 
+class LarsMomentumOptimizer(Optimizer):
+    """
+    Momentum optimizer with LARS support
+
+    The update equations are as follows:
+
+    .. math::
+
+        & local\_learning\_rate = learning\_rate * lars\_coeff * \\
+          \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
+
+        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)
+
+        & param = param - velocity
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+        momentum (float): momentum factor
+        lars_coeff (float): defines how much we trust the layer to change its weights.
+        lars_weight_decay (float): weight decay coefficient for decaying using LARS.
+        regularization: A Regularizer, such as
+                        fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.
+        
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.LarsMomentum(learning_rate=0.2, momentum=0.1, lars_weight_decay=0.001)
+            optimizer.minimize(cost)
+    """
+    _velocity_acc_str = "velocity"
+
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 lars_coeff=0.001,
+                 lars_weight_decay=0.0005,
+                 regularization=None,
+                 name=None):
+        assert learning_rate is not None
+        assert momentum is not None
+        super(LarsMomentumOptimizer, self).__init__(
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
+        self.type = "lars_momentum"
+        self._momentum = momentum
+        self._lars_coeff = float(lars_coeff)
+        self._lars_weight_decay = float(lars_weight_decay)
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(self._velocity_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        velocity_acc = self._get_accumulator(self._velocity_acc_str,
+                                             param_and_grad[0])
+        # create the momentum optimize op
+        momentum_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Velocity": velocity_acc,
+                "LearningRate": self._create_param_lr(param_and_grad)
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "VelocityOut": velocity_acc
+            },
+            attrs={
+                "mu": self._momentum,
+                "lars_coeff": self._lars_coeff,
+                "lars_weight_decay": self._lars_weight_decay
+            })
+
+        return momentum_op
+
+
 class AdagradOptimizer(Optimizer):
     """
     **Adaptive Gradient Algorithm (Adagrad)**
@@ -601,7 +690,8 @@ class AdamOptimizer(Optimizer):
         for param, grad in param_and_grads:
             if grad is None:
                 continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope("optimizer"):
                 beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                       param)
                 beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
@@ -739,7 +829,8 @@ class AdamaxOptimizer(Optimizer):
         for param, grad in parameters_and_grads:
             if grad is None:
                 continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('adamx'):
                 beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                       param)
                 main_block.append_op(
@@ -1216,6 +1307,7 @@ DecayedAdagrad = DecayedAdagradOptimizer
 Adadelta = AdadeltaOptimizer
 RMSProp = RMSPropOptimizer
 Ftrl = FtrlOptimizer
+LarsMomentum = LarsMomentumOptimizer
 
 
 class ModelAverage(Optimizer):
@@ -1278,7 +1370,8 @@ class ModelAverage(Optimizer):
         for param, grad in self.params_grads:
             if grad is None:
                 continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('move_average'):
                 self._append_average_accumulate_op(param)
 
         self.apply_program = Program()
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index a4336e955f21b0b09bf3dadbd437855c06745860..57185da4d1d38f3848994aae105411cf2844843a 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -47,7 +47,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
         if grad is None:
             params_and_grads.append((param, grad))
             continue
-        with param.block.program._optimized_guard([param, grad]):
+        with param.block.program._optimized_guard(
+            [param, grad]), framework.name_scope('regularization'):
             regularization_term = None
             if param.regularizer is not None:
                 # Add variable for regularization term in grad block
@@ -151,7 +152,7 @@ class L2DecayRegularizer(WeightDecayRegularizer):
             decay = block.create_var(
                 dtype="float32",
                 shape=param.shape,
-                type=core.VarDesc.VarType.SELECTED_ROWS)
+                type=core.VarDesc.VarType.LOD_TENSOR)
             block.append_op(
                 type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
             block.append_op(
@@ -228,7 +229,7 @@ class L1DecayRegularizer(WeightDecayRegularizer):
             decay = block.create_var(
                 dtype="float32",
                 shape=param.shape,
-                type=core.VarDesc.VarType.SELECTED_ROWS)
+                type=core.VarDesc.VarType.LOD_TENSOR)
             block.append_op(
                 type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
             block.append_op(
@@ -237,6 +238,7 @@ class L1DecayRegularizer(WeightDecayRegularizer):
                         'Ids': idx},
                 outputs={'Out': decay},
                 attrs={'is_sparse': True})
+            param = decay
 
         # Append sign op
         block.append_op(
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index 1885dda44ab5eaeca6a4f54e4b84379c71ec3167..d24417bbacb503d9ea70e68e7e0edb59e7dddbde 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -1,4 +1,3 @@
-set(PYTHON_TESTS_DIR ${CMAKE_CURRENT_BINARY_DIR} CACHE PATH "python tests directory")
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
index 673c965b662a022739f8d489c331f4de9455a926..91c1d17eb5391ea37a41a886594cc71c6e6c56bd 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
@@ -1,7 +1,19 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
+if(NOT APPLE)
+    # default test
+    foreach(src ${TEST_OPS})
+        py_test(${src} SRCS ${src}.py)
+    endforeach()
+else()
+    foreach(src ${TEST_OPS})
+        if(${src} STREQUAL "test_image_classification_vgg")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif(${src} STREQUAL "test_image_classification_resnet")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif()
+            py_test(${src} SRCS ${src}.py)
+        endif()
+    endforeach()
+endif()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 56129641ce5900d82aedf243d2fa1eadfd6b8d86..28dc7519571d8b5464e92fddf634ba46691ceaa9 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -301,7 +301,7 @@ class TestRpnTargetAssign(unittest.TestCase):
                 dtype='float32',
                 lod_level=1,
                 append_batch_size=False)
-            pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
+            pred_scores, pred_loc, tgt_lbl, tgt_bbox, bbox_inside_weight = layers.rpn_target_assign(
                 bbox_pred=bbox_pred,
                 cls_logits=cls_logits,
                 anchor_box=anchor_box,
@@ -313,15 +313,18 @@ class TestRpnTargetAssign(unittest.TestCase):
                 rpn_straddle_thresh=0.0,
                 rpn_fg_fraction=0.5,
                 rpn_positive_overlap=0.7,
-                rpn_negative_overlap=0.3)
+                rpn_negative_overlap=0.3,
+                use_random=False)
 
             self.assertIsNotNone(pred_scores)
             self.assertIsNotNone(pred_loc)
             self.assertIsNotNone(tgt_lbl)
             self.assertIsNotNone(tgt_bbox)
+            self.assertIsNotNone(bbox_inside_weight)
             assert pred_scores.shape[1] == 1
             assert pred_loc.shape[1] == 4
             assert pred_loc.shape[1] == tgt_bbox.shape[1]
+            print(str(program))
 
 
 class TestGenerateProposals(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 7de0ebce06e9de439d3570bee9ac7dbce33ee868..2e87d8f4b4fa07773f205fd0a2151095a2353fc6 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
     LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)
 
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME)
     if (py_test_modules_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     endif()
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
 endfunction()
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
@@ -78,9 +83,9 @@ if(WITH_DISTRIBUTE)
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
         py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
         set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
-        # TODO: fix this test
-        #py_test_modules(test_dist_transformer MODULES test_dist_transformer)
-        #set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
+        # FIXME(typhoonzero): add this back
+	#py_test_modules(test_dist_transformer MODULES test_dist_transformer)
+	#set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
     endif(NOT APPLE)
     py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
-py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+if(NOT APPLE)
+    py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+endif()
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index 877d21ae882ab4efb49beb6a846ab71a22c2aab7..1cda2711f765622b0bda6f4c688f69352bbd2a6f 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -90,12 +90,14 @@ class TestDistMnist2x2(TestDistRunnerBase):
 
         inference_program = fluid.default_main_program().clone()
         # Optimization
-        opt = fluid.optimizer.AdamOptimizer(
-            learning_rate=0.001, beta1=0.9, beta2=0.999)
+        # TODO(typhoonzero): fix distributed adam optimizer
+        # opt = fluid.optimizer.AdamOptimizer(
+        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
+        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
 
         # Reader
         train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size)
+            paddle.dataset.mnist.test(), batch_size=batch_size)
         test_reader = paddle.batch(
             paddle.dataset.mnist.test(), batch_size=batch_size)
         opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
new file mode 100644
index 0000000000000000000000000000000000000000..d386e75fd887a898f5a13e48e378e08ff6c99ea0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
@@ -0,0 +1,80 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+from dist_mnist import cnn_model
+
+DTYPE = "float32"
+
+
+def test_merge_reader(repeat_batch_size=8):
+    orig_reader = paddle.dataset.mnist.test()
+    record_batch = []
+    b = 0
+    for d in orig_reader():
+        if b >= repeat_batch_size:
+            break
+        record_batch.append(d)
+        b += 1
+    while True:
+        for d in record_batch:
+            yield d
+
+
+class TestDistMnist2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        # Input data
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_acc = fluid.layers.accuracy(
+            input=predict, label=label, total=batch_size_tensor)
+
+        inference_program = fluid.default_main_program().clone()
+        # Optimization
+        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+
+        # Reader
+        train_reader = paddle.batch(test_merge_reader, batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        opt.minimize(avg_cost)
+        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
new file mode 100644
index 0000000000000000000000000000000000000000..977e17c37f7676ae81d9ab29b6b36089ccbeeacf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+from dist_mnist import cnn_model
+
+DTYPE = "float32"
+paddle.dataset.mnist.fetch()
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestDistMnist2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        # Input data
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_acc = fluid.layers.accuracy(
+            input=predict, label=label, total=batch_size_tensor)
+
+        inference_program = fluid.default_main_program().clone()
+        # Optimization
+        opt = fluid.optimizer.LarsMomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+
+        # Reader
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        opt.minimize(avg_cost)
+        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..edc60550058f53da456c21de4b41142b907743df
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
@@ -0,0 +1,174 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import signal
+import subprocess
+import argparse
+import time
+import math
+import random
+from multiprocessing import Process
+from functools import reduce
+
+import numpy as np
+import unittest
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid import io
+
+from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
+from dist_simnet_bow import TestDistSimnetBow2x2, DATA_URL, DATA_MD5
+
+
+class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
+    def _load_persistable_vars(self, executor, dirname, program):
+        def _is_checkpoint_var(var):
+            """
+            the checkpoint will not save or load all the variables.
+            var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+            : param var(Variable)
+            """
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.RAW:
+                return False
+            # @GRAD are named for gradient variables, checkpoint will not save it.
+            if "@GRAD" in var.name:
+                return False
+            # .trainer_ are named for distribute train variables, checkpoint will not save it.
+            if ".trainer_" in var.name:
+                return False
+
+            # .block is named for distribute train variables, checkpoint will not save it.
+            if ".block" in var.name:
+                return False
+
+            if "tmp_" in var.name:
+                return False
+
+            return var.persistable
+
+        io.load_vars(
+            executor,
+            dirname=dirname,
+            main_program=program,
+            predicate=_is_checkpoint_var,
+            filename=None)
+
+    def run_pserver(self, args):
+        self.get_model(batch_size=2)
+        # NOTE: pserver should not call memory optimize
+        t = self.get_transpiler(args.trainer_id,
+                                fluid.default_main_program(), args.endpoints,
+                                args.trainers, args.sync_mode)
+        pserver_prog = t.get_pserver_program(args.current_endpoint)
+        startup_prog = t.get_startup_program(args.current_endpoint,
+                                             pserver_prog)
+
+        need_load = bool(int(os.getenv("LOAD", "0")))
+        model_dir = os.getenv("MODEL_DIR", "")
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+
+        if need_load and model_dir:
+            self._load_persistable_vars(exe, model_dir, startup_prog)
+        exe.run(pserver_prog)
+
+    def run_trainer(self, args):
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+            self.get_model(batch_size=2)
+
+        if args.mem_opt:
+            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
+        if args.is_dist:
+            t = self.get_transpiler(args.trainer_id,
+                                    fluid.default_main_program(),
+                                    args.endpoints, args.trainers,
+                                    args.sync_mode)
+
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+
+        build_stra = fluid.BuildStrategy()
+
+        if args.use_reduce:
+            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        else:
+            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+
+        exe = fluid.ParallelExecutor(
+            args.use_cuda,
+            loss_name=avg_cost.name,
+            exec_strategy=strategy,
+            build_strategy=build_stra)
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.values()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
+
+        def get_data():
+            origin_batch = next(reader_generator)
+            if args.is_dist and args.use_reader_alloc:
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch
+
+        need_save = bool(int(os.getenv("SAVE", "0")))
+        model_dir = os.getenv("MODEL_DIR", "")
+
+        if need_save:
+            for _ in six.moves.xrange(RUN_STEP):
+                loss, = exe.run(fetch_list=[avg_cost.name],
+                                feed=feeder.feed(get_data()))
+            if need_save and model_dir:
+                io.save_persistables(startup_exe, model_dir, trainer_prog)
+
+        var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor())
+        print(np.ravel(var).tolist())
+
+
+if __name__ == "__main__":
+    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
+    runtime_main(TestDistSaveLoad2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
index 6456d1b53a129db04ace7ff4413a3d76e922ccde..fac5e037a46715d146e354825f09ee8ccc4f3d70 100644
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
@@ -81,7 +81,10 @@ def get_optimizer():
     return optimizer
 
 
-def train_network(batch_size, is_distributed=False, is_sparse=False):
+def train_network(batch_size,
+                  is_distributed=False,
+                  is_sparse=False,
+                  is_self_contained_lr=False):
     # query
     q = fluid.layers.data(
         name="query_ids", shape=[1], dtype="int64", lod_level=1)
@@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
         param_attr=fluid.ParamAttr(
             initializer=fluid.initializer.Constant(value=0.01),
             name="__emb__",
-            learning_rate=emb_lr),
+            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__"),
         is_sparse=is_sparse)
     ## vsum
     q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
@@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
         param_attr=fluid.ParamAttr(
             initializer=fluid.initializer.Constant(value=0.01),
             name="__emb__",
-            learning_rate=emb_lr),
+            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__"),
         is_sparse=is_sparse)
     ## vsum
     pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
@@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
         param_attr=fluid.ParamAttr(
             initializer=fluid.initializer.Constant(value=0.01),
             name="__emb__",
-            learning_rate=emb_lr),
+            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__"),
         is_sparse=is_sparse)
     ## vsum
     nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
@@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase):
     def get_model(self, batch_size=2):
         # Train program
         avg_cost, acc, predict = \
-            train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])))
+            train_network(batch_size,
+                          bool(int(os.environ["IS_DISTRIBUTED"])),
+                          bool(int(os.environ["IS_SPARSE"])),
+                          bool(int(os.environ["IS_SELF_CONTAINED_LR"])))
 
         inference_program = fluid.default_main_program().clone()
 
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
index a2cc57425841100a2b61279d1b447b88ed4b9a54..27c67edf4f62dd3c5d396826348f8da4513667ba 100644
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -35,7 +35,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid import core
-from test_dist_base import TestDistRunnerBase, runtime_main
+from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
 import paddle.compat as cpt
 from paddle.compat import long_type
 
@@ -562,18 +562,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
     for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
         pass_start_time = time.time()
         for batch_id, data in enumerate(train_data()):
-            if batch_id >= 5:
+            if batch_id >= RUN_STEP:
                 break
 
             feed_list = []
             total_num_token = 0
 
-            #if TrainTaskConfig.local:
-            #    lr_rate = lr_scheduler.update_learning_rate()
-            #for place_id, data_buffer in enumerate(
-            #        split_data(
-            #            data, num_part=dev_count)):
-
             if TrainTaskConfig.local:
                 lr_rate = lr_scheduler.update_learning_rate()
 
@@ -619,12 +613,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
             init = True
 
             # Validate and save the model for inference.
-            if batch_id == 0 or batch_id == 4:
-                if TrainTaskConfig.val_file_pattern is not None:
-                    val_avg_cost, val_ppl = test()
-                    print("[%f]" % val_avg_cost)
-                else:
-                    assert (False)
+            if TrainTaskConfig.val_file_pattern is not None:
+                val_avg_cost, val_ppl = test()
+                print("[%f]" % val_avg_cost)
+            else:
+                assert (False)
 
 
 #import transformer_reader as reader
@@ -1166,6 +1159,7 @@ def prepare_encoder(src_word,
             name=pos_enc_param_name,
             trainable=False,
             initializer=fluid.initializer.ConstantInitializer(0.001)))
+    src_pos_enc.stop_gradient = True
     enc_input = src_word_emb + src_pos_enc
     return layers.dropout(
         enc_input,
@@ -1701,7 +1695,7 @@ class DistTransformer2x2(TestDistRunnerBase):
 
     def run_trainer(self, args):
         TrainTaskConfig.use_gpu = args.use_cuda
-        sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
+        sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
             args.is_dist, not args.sync_mode)
 
         if args.is_dist:
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index ee291fe746f3a1b6ce18df9fb6aa174a89e2eadd..a3fe5e0a0591c8da787e3c2fdb030f3912548316 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -40,7 +40,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   use_reduce=False,
                                   fuse_elewise_add_act_ops=False,
                                   optimizer=fluid.optimizer.Adam,
-                                  use_fast_executor=False):
+                                  use_fast_executor=False,
+                                  enable_sequential_execution=False):
         def run_executor(exe, feed, fetch_list, program=None):
             if isinstance(exe, fluid.ParallelExecutor):
                 res = exe.run(fetch_list=fetch_list, feed=feed)
@@ -80,6 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
             build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+            build_strategy.enable_sequential_execution = enable_sequential_execution
 
             if use_parallel_executor:
                 exe = fluid.ParallelExecutor(
diff --git a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f2a33793028f0883ffe94dd8a32626ad5c0351c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
@@ -0,0 +1,134 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import math
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+class TestAddPositionEncodingTensorOp(OpTest):
+    """
+    This class is to test the AddPositionEncodingOp
+    """
+
+    def setUp(self):
+        """
+        the prepared section for add position encoding op
+        """
+        self.op_type = "add_position_encoding"
+        self.dtype = np.float32
+        self.init_input_output()
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x), }
+        self.outputs = {'Out': self.out}
+        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
+
+    def test_check_output(self):
+        """
+        check the correctness of output
+        """
+        self.check_output()
+
+    def test_check_grad(self):
+        """
+        check the correctness of grad
+        """
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+
+    def init_input_output(self):
+        """
+        init the input and output for test cases
+        """
+        self.alpha = 0.6
+        self.beta = 0.5
+        self.x = np.random.uniform(0.1, 1, [2, 4, 4]).astype(self.dtype)
+        self.out = np.copy(self.x)
+
+        batch_size = self.x.shape[0]
+        max_length = self.x.shape[1]
+        enc_size = self.x.shape[2]
+
+        half_shape = int(enc_size / 2)
+        for i in range(batch_size):
+            for j in range(max_length):
+                for k in range(half_shape):
+                    val = j / pow(10000.0, k / (
+                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    self.out[i, j, k] = \
+                        self.x[i, j, k] * self.alpha + math.sin(val) * self.beta
+                    self.out[i, j, half_shape + k] = \
+                        self.x[i, j, half_shape + k] * self.alpha + math.cos(val) * self.beta
+
+
+class TestAddPositionEncodingLoDTensorOp(OpTest):
+    """
+    This class is to test the AddPositionEncodingLoDTensorOp
+    """
+
+    def setUp(self):
+        """
+        the prepared section for add position encoding LoDTensor op
+        """
+        self.op_type = "add_position_encoding"
+        self.dtype = np.float32
+        self.init_input_output()
+
+        self.inputs = {'X': (self.x, self.lod), }
+        self.outputs = {'Out': (self.out, self.lod)}
+        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
+
+    def test_check_output(self):
+        """
+        check the correctness of output
+        """
+        self.check_output()
+
+    def test_check_grad(self):
+        """
+        check the correctness of grad
+        """
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+
+    def init_input_output(self):
+        """
+        init the input and output for test cases
+        """
+        self.alpha = 0.6
+        self.beta = 0.5
+        self.x = np.random.uniform(0.1, 1, [10, 4]).astype(self.dtype)
+        self.lod = [[3, 7]]
+        self.out = np.copy(self.x)
+
+        batch_size = len(self.lod[0])
+        enc_size = self.x.shape[1]
+
+        start = 0
+        half_shape = int(enc_size / 2)
+        for i in range(batch_size):
+            max_length = self.lod[0][i]
+            for j in range(max_length):
+                for k in range(half_shape):
+                    val = j / pow(10000.0, k / (
+                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    pos = start + j
+                    self.out[pos, k] = \
+                        self.x[pos, k] * self.alpha + math.sin(val) * self.beta
+                    self.out[pos, half_shape + k] = \
+                        self.x[pos, half_shape + k] * self.alpha + math.cos(val) * self.beta
+            start += max_length
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c9a063e6ee75371e0d05e1ff6964753017881a1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
@@ -0,0 +1,106 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+def affine_channel(x, scale, bias, layout):
+    C = x.shape[1] if layout == 'NCHW' else x.shape[-1]
+    if len(x.shape) == 4:
+        new_shape = (1, C, 1, 1) if layout == 'NCHW' else (1, 1, 1, C)
+    else:
+        new_shape = (1, C)
+    scale = scale.reshape(new_shape)
+    bias = bias.reshape(new_shape)
+    return x * scale + bias
+
+
+class TestAffineChannelOp(OpTest):
+    def setUp(self):
+        self.op_type = "affine_channel"
+        self.init_test_case()
+
+        x = np.random.random(self.shape).astype("float32")
+        scale = np.random.random(self.C).astype("float32")
+        bias = np.random.random(self.C).astype("float32")
+
+        y = affine_channel(x, scale, bias, self.layout)
+
+        self.inputs = {'X': x, 'Scale': scale, 'Bias': bias}
+        self.attrs = {'data_layout': self.layout}
+        self.outputs = {'Out': y}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Scale', 'Bias'], 'Out')
+
+    def test_check_grad_stopgrad_dx(self):
+        self.check_grad(['Scale', 'Bias'], 'Out', no_grad_set=set('X'))
+
+    def test_check_grad_stopgrad_dscale_dbias(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set(['Scale', 'Bias']))
+
+    def init_test_case(self):
+        self.shape = [2, 32, 14, 14]
+        self.C = 32
+        self.layout = 'NCHW'
+
+
+class TestAffineChannelNHWC(TestAffineChannelOp):
+    def init_test_case(self):
+        self.shape = [2, 14, 14, 32]
+        self.C = 32
+        self.layout = 'NHWC'
+
+
+class TestAffineChannel2D(TestAffineChannelOp):
+    def init_test_case(self):
+        self.shape = [16, 64]
+        self.C = 64
+        self.layout = 'NCHW'
+
+
+class TestAffineChannelNCHWLargeShape(TestAffineChannelOp):
+    def init_test_case(self):
+        self.shape = [64, 128, 112, 112]
+        self.C = 128
+        self.layout = 'NCHW'
+
+    # since the gradient check is very slow in large shape, so skip check_grad
+    def test_check_grad(self):
+        pass
+
+    def test_check_grad_stopgrad_dx(self):
+        pass
+
+    def test_check_grad_stopgrad_dscale_dbias(self):
+        pass
+
+
+class TestAffineChannelNCHWLargeShape(TestAffineChannelNCHWLargeShape):
+    def init_test_case(self):
+        self.shape = [64, 112, 112, 512]
+        self.C = 512
+        self.layout = 'NHWC'
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..576d00940c4c7a5e30af5550e14b674a73e7df11
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
@@ -0,0 +1,79 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def AffineGrid(theta, size):
+    n = size[0]
+    w = size[3]
+    h = size[2]
+    h_idx = np.repeat(
+        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
+    w_idx = np.repeat(
+        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
+    grid = np.concatenate(
+        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
+    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
+
+    ret = np.zeros([n, h * w, 2])
+    theta = theta.transpose([0, 2, 1])
+    for i in range(len(theta)):
+        ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
+
+#    print ret.reshape([h * w, 2]).astype("float32")    
+    return ret.reshape([n, h, w, 2]).astype("float32")
+
+
+class TestAffineGridOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "affine_grid"
+        theta = np.random.randint(1, 3, self.theta_shape).astype("float32")
+        theta = np.ones(self.theta_shape).astype("float32")
+        self.inputs = {'Theta': theta}
+        self.attrs = {"use_cudnn": True}
+        if self.dynamic_shape:
+            self.inputs['OutputShape'] = self.output_shape
+        else:
+            self.attrs['output_shape'] = self.output_shape
+        self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(
+            ['Theta'],
+            'Output',
+            no_grad_set=['OutputShape'],
+            max_relative_error=0.006)
+
+    def initTestCase(self):
+        self.theta_shape = (3, 2, 3)
+        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = False
+
+
+class TestAffineGridOpCase1(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (3, 2, 3)
+        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 04924bec057e301bfb342a62bb4c1e0b3c3aff4c..07814bc2571b380ec24c825615e3ef3d16e694be 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -22,14 +22,17 @@ import signal
 import subprocess
 import six
 import argparse
+import pickle
+import numpy as np
 
 import paddle.fluid as fluid
 
 RUN_STEP = 10
+DEFAULT_BATCH_SIZE = 2
 
 
 class TestDistRunnerBase(object):
-    def get_model(self, batch_size=2):
+    def get_model(self, batch_size=DEFAULT_BATCH_SIZE):
         raise NotImplementedError(
             "get_model should be implemented by child classes.")
 
@@ -48,8 +51,7 @@ class TestDistRunnerBase(object):
         return t
 
     def run_pserver(self, args):
-
-        self.get_model(batch_size=2)
+        self.get_model(batch_size=args.batch_size)
         # NOTE: pserver should not call memory optimize
         t = self.get_transpiler(args.trainer_id,
                                 fluid.default_main_program(), args.endpoints,
@@ -65,7 +67,7 @@ class TestDistRunnerBase(object):
 
     def run_trainer(self, args):
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-            self.get_model(batch_size=2)
+            self.get_model(batch_size=args.batch_size)
 
         if args.mem_opt:
             fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
@@ -92,6 +94,11 @@ class TestDistRunnerBase(object):
         strategy.allow_op_delay = False
 
         build_stra = fluid.BuildStrategy()
+        if args.batch_merge_repeat > 1:
+            pass_builder = build_stra._create_passes_from_strategy()
+            mypass = pass_builder.insert_pass(
+                len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
+            mypass.set_int("num_repeats", args.batch_merge_repeat)
 
         if args.use_reduce:
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
@@ -123,10 +130,15 @@ class TestDistRunnerBase(object):
             else:
                 return origin_batch
 
+        out_losses = []
         for _ in six.moves.xrange(RUN_STEP):
             loss, = exe.run(fetch_list=[avg_cost.name],
                             feed=feeder.feed(get_data()))
-            print(loss)
+            out_losses.append(loss[0])
+        if six.PY2:
+            print(pickle.dumps(out_losses))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out_losses))
 
 
 def runtime_main(test_class):
@@ -144,7 +156,10 @@ def runtime_main(test_class):
     parser.add_argument('--use_cuda', action='store_true')
     parser.add_argument('--use_reduce', action='store_true')
     parser.add_argument(
-        '--use_reader_alloc', action='store_true', required=False, default=True)
+        '--use_reader_alloc', action='store_true', required=False)
+    parser.add_argument('--batch_size', required=False, type=int, default=2)
+    parser.add_argument(
+        '--batch_merge_repeat', required=False, type=int, default=1)
 
     args = parser.parse_args()
 
@@ -180,7 +195,7 @@ class TestDistBase(unittest.TestCase):
         self._pservers = 2
         self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
             self._find_free_port(), self._find_free_port())
-        self._python_interp = "python"
+        self._python_interp = sys.executable
         self._sync_mode = True
         self._enforce_place = None
         self._mem_opt = False
@@ -229,24 +244,18 @@ class TestDistBase(unittest.TestCase):
 
         return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
 
-    def _wait_ps_ready(self, pid):
-        retry_times = 50
-        while True:
-            assert retry_times >= 0, "wait ps ready failed"
-            time.sleep(3)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error as e:
-                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
-                                 (e, retry_times))
-                retry_times -= 1
-
-    def _run_local(self, model, envs, check_error_log):
+    def _run_local(self,
+                   model,
+                   envs,
+                   check_error_log=False,
+                   batch_size=DEFAULT_BATCH_SIZE,
+                   batch_merge_repeat=1):
 
         cmd = "%s %s --role trainer" % (self._python_interp, model)
+        if batch_size != DEFAULT_BATCH_SIZE:
+            cmd += " --batch_size %d" % batch_size
+        if batch_merge_repeat > 1:
+            cmd += " --batch_merge_repeat %d" % batch_merge_repeat
 
         if self.__use_cuda:
             cmd += " --use_cuda"
@@ -271,23 +280,20 @@ class TestDistBase(unittest.TestCase):
                 env=envs)
 
         local_out, local_err = local_proc.communicate()
-        local_ret = cpt.to_text(local_out)
 
         if check_error_log:
             err_log.close()
 
-        sys.stderr.write('local_stdout: %s\n' % local_ret)
+        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
         sys.stderr.write('local_stderr: %s\n' % local_err)
 
-        local_losses = local_ret.split("\n")
-        return local_losses
+        return pickle.loads(local_out)
 
     def _run_cluster(self, model, envs, check_error_log):
         # Run dist train to compare with local results
         ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
                                                           check_error_log, envs)
-        self._wait_ps_ready(ps0.pid)
-        self._wait_ps_ready(ps1.pid)
+
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
 
         tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
@@ -322,8 +328,8 @@ class TestDistBase(unittest.TestCase):
         env0.update(envs)
         env1.update(envs)
 
-        print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
-        print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
+        print("tr0_cmd:{}".format(tr0_cmd))
+        print("tr1_cmd:{}".format(tr1_cmd))
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
@@ -339,9 +345,7 @@ class TestDistBase(unittest.TestCase):
             env=env1)
 
         tr0_out, tr0_err = tr0_proc.communicate()
-        tr0_loss_text = cpt.to_text(tr0_out)
         tr1_out, tr1_err = tr1_proc.communicate()
-        tr1_loss_text = cpt.to_text(tr1_out)
 
         # close trainer file
         tr0_pipe.close()
@@ -356,15 +360,13 @@ class TestDistBase(unittest.TestCase):
         ps1.terminate()
 
         # print log
-        sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
-        sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
+        sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
+        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
+        sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
         sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
 
-        tr0_losses = tr0_loss_text.split("\n")
-        tr1_losses = tr1_loss_text.split("\n")
-
-        return tr0_losses, tr1_losses
+        # return tr0_losses, tr1_losses
+        return pickle.loads(tr0_out), pickle.loads(tr1_out)
 
     def check_with_place(self,
                          model_file,
@@ -394,9 +396,9 @@ class TestDistBase(unittest.TestCase):
                                                    check_error_log)
 
         for step_id in range(RUN_STEP):
-            local_loss = eval(local_losses[step_id])[0]
-            tr0_loss = eval(tr0_losses[step_id])[0]
-            tr1_loss = eval(tr1_losses[step_id])[0]
-            dist_loss = (tr0_loss + tr1_loss) / 2
-            print(str(local_loss) + ":" + str(dist_loss))
-            self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
+            local_loss = local_losses[step_id]
+            tr0_loss = tr0_losses[step_id]
+            tr1_loss = tr1_losses[step_id]
+            dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
+            print("=======", local_loss, ":", dist_loss[0], "=======")
+            self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
index 3575fd07fc727bd6c6b07a19a60b1df6656ae9e2..b2d979729bc9b2546375cb657f78abe0d8c2dcc7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
@@ -18,14 +18,14 @@ import unittest
 from test_dist_base import TestDistBase
 
 
+# FIXME(tangwei): sum op can not handle when inputs is empty.
 class TestDistCTR2x2(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
         self._enforce_place = "CPU"
 
-
-def test_dist_ctr(self):
-    self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
+    def test_dist_ctr(self):
+        self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index f65dd7e2a28c4ace3988c0cc1267ebe981fbd9cb..922dd838f8996adfc15afffcd44c1acca2bc14a9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -26,6 +26,15 @@ class TestDistMnist2x2(TestDistBase):
         self.check_with_place("dist_mnist.py", delta=1e-5)
 
 
+class TestDistMnist2x2Lars(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_mnist_lars.py", delta=1e-5)
+
+
 class TestDistMnist2x2WithMemopt(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
new file mode 100644
index 0000000000000000000000000000000000000000..22d4b7929033529c5cea60064e6d9de57eddeb8e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
@@ -0,0 +1,67 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+import os
+
+
+class TestDistMnist2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+
+    def test_dist_train(self):
+        self.check_with_place("dist_mnist_batch_merge.py", delta=1e-5)
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        # TODO(typhoonzero): should auto adapt GPU count on the machine.
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_cudnn_deterministic": "1",
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        no_merge_losses = self._run_local(
+            model_file,
+            required_envs,
+            check_error_log=check_error_log,
+            batch_size=4)
+
+        batch_merge_losses = self._run_local(
+            model_file,
+            required_envs,
+            check_error_log=check_error_log,
+            batch_size=2,
+            batch_merge_repeat=2)
+        # Ensure both result have values.
+        self.assertGreater(len(no_merge_losses), 1)
+        self.assertEqual(len(no_merge_losses), len(batch_merge_losses))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..03066fee48b703f8b55bd4ae6a9c4bb8deecab1e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
@@ -0,0 +1,90 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import shutil
+import unittest
+import tempfile
+
+import numpy as np
+
+from test_dist_base import TestDistBase, RUN_STEP
+
+
+class TestDistSaveLoadDense2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "http_proxy": ""
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        model_dir = tempfile.mkdtemp()
+
+        local_env = {}
+        local_env["SAVE"] = "1"
+        local_env["MODEL_DIR"] = model_dir
+        local_env.update(required_envs)
+
+        cluster_env = {}
+        cluster_env["LOAD"] = "1"
+        cluster_env["MODEL_DIR"] = model_dir
+        cluster_env.update(required_envs)
+
+        local_var = self._run_local(model_file, local_env, check_error_log)
+        tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
+                                             check_error_log)
+
+        shutil.rmtree(model_dir)
+
+        local_np = np.array(eval(local_var[0]))
+        train0_np = np.array(eval(tr0_var[0]))
+        train1_np = np.array(eval(tr1_var[0]))
+        self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta)
+        self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta)
+        self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta)
+
+    @unittest.skip(reason="CI fail")
+    def test_dist(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
+        self.check_with_place(
+            "dist_save_load.py",
+            delta=0,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
index c0989ca709e100d8f147a08970b0e858c81ce09b..c2a4e5ca0c050813785f602c5d2088466e616971 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase):
         self._use_reader_alloc = False
 
     def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=100)
+        self.check_with_place("dist_se_resnext.py", delta=1e-7)
 
 
 class TestDistseResnXt2x2WithMemopt(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
         self._mem_opt = True
+        self._use_reader_alloc = False
 
     def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=100)
+        self.check_with_place("dist_se_resnext.py", delta=1e-7)
 
 
 class TestDistSeResneXt2x2Async(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
index e971f29db42a7c1a2394505a8ece3d2fd6b347e9..102a4dab05fe1adc6a503920714f50415b29dc19 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
@@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase):
         self._enforce_place = "CPU"
 
     def test_simnet_bow(self):
-        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
         self.check_with_place(
             "dist_simnet_bow.py",
             delta=1e-5,
@@ -38,8 +42,12 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
         self._sync_mode = False
         self._enforce_place = "CPU"
 
-    def test_simnet_bow(self):
-        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
+    def no_test_simnet_bow(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
         self.check_with_place(
             "dist_simnet_bow.py",
             delta=100,
@@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
         self._enforce_place = "CPU"
 
     def test_simnet_bow(self):
-        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
         self.check_with_place(
             "dist_simnet_bow.py",
             delta=1e-5,
@@ -67,7 +79,48 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
         self._enforce_place = "CPU"
 
     def test_simnet_bow(self):
-        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=100,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+# FIXME(tangwei): Learningrate variable is not created on pserver.
+class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def test_simnet_bow(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '1',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=1e-5,
+            check_error_log=True,
+            need_envs=need_envs)
+
+
+class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._enforce_place = "CPU"
+
+    def test_simnet_bow(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '1',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
         self.check_with_place(
             "dist_simnet_bow.py",
             delta=100,
@@ -75,5 +128,23 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
             need_envs=need_envs)
 
 
+class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def test_simnet_bow(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '1',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '0'
+        }
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=1e-5,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
index 47e8dfaf03ceb27a74f5e48d662d2b534d2d152b..25dcccc28d710695d4c5e08c17816669d0fae5d8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -61,7 +61,8 @@ class TestDistTransformer2x2Sync(TestDistBase):
 
     def test_dist_train(self):
         download_files()
-        self.check_with_place("dist_transformer.py", delta=1e-5)
+        self.check_with_place(
+            "dist_transformer.py", delta=1e-5, check_error_log=False)
 
 
 class TestDistTransformer2x2Async(TestDistBase):
@@ -70,7 +71,8 @@ class TestDistTransformer2x2Async(TestDistBase):
 
     def test_dist_train(self):
         download_files()
-        self.check_with_place("dist_transformer.py", delta=1.0)
+        self.check_with_place(
+            "dist_transformer.py", delta=1.0, check_error_log=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 54a1c68a37f6929890aab697b48d621e6effb7d8..986fdd9ff27fe2be54ce97f330028b4ae2358714 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest):
         trainer, _ = self.get_trainer()
 
 
+class TestFtrl(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        opt = fluid.optimizer.Ftrl(learning_rate=0.1)
+        opt.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer, _ = self.get_trainer()
+
+
 class TestLRDecayConditional(TranspilerTest):
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
@@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest):
             ["sum", "scale", "scale", "elementwise_add", "momentum"])
 
 
+class TestEmptyPserverOptimizeBlocks(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        # only one parameter
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=False)
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0)
+        sgd_optimizer.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+        config.slice_var_up = False
+
+        pserver, startup = self.get_pserver(ep=self.pserver2_ep, config=config)
+
+        self.assertEqual(len(pserver.blocks), 2)
+        self.assertEqual(len(pserver.blocks[1].ops), 0)
+
+
 class TestDistLookupTableBase(TranspilerTest):
     def network_with_table(self, is_sparse, is_distributed):
         self.table_size = 1000
         self.emb_size = 64
         self.lookup_table_name = 'shared_w'
 
-        def emb_pool(ids):
+        def emb_pool(ids, table_name, is_distributed):
             emb = fluid.layers.embedding(
                 input=ids,
                 size=[self.table_size, self.emb_size],
                 dtype='float32',
-                param_attr=self.lookup_table_name,  # share parameter
+                param_attr=table_name,
                 is_sparse=is_sparse,
                 is_distributed=is_distributed)
             pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
@@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest):
             name='title_ids', shape=[1], dtype='int64', lod_level=1)
         brand_ids = fluid.layers.data(
             name='brand_ids', shape=[1], dtype='int64', lod_level=1)
-        title_emb = emb_pool(title_ids)
-        brand_emb = emb_pool(brand_ids)
-        fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1)
+        profile_ids = fluid.layers.data(
+            name='brand_ids', shape=[1], dtype='int64', lod_level=1)
+        title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
+        brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
+        profile_emb = emb_pool(profile_ids, "profile_emb", False)
+        fc0 = fluid.layers.concat(
+            input=[title_emb, brand_emb, profile_emb], axis=1)
         predict = fluid.layers.fc(input=fc0,
                                   size=2,
                                   act=None,
@@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
     def transpiler_test_impl(self):
         pserver1, startup1 = self.get_pserver(self.pserver1_ep)
 
-        self.assertEqual(len(pserver1.blocks), 3)
+        self.assertEqual(len(pserver1.blocks), 4)
         # 0 listen_and_serv
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
@@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase):
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["sum", "scale", "adam", "scale", "scale"])
 
+        # 3 optimize for table 2 adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+
         trainer, _ = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
-            'fill_constant', 'mean_grad', 'cross_entropy_grad',
-            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_selected_rows', 'send',
-            'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
+            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'split_selected_rows', 'send', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
+            'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 
@@ -485,31 +540,43 @@ class TestDistLookupTable(TestDistLookupTableBase):
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
                          ["sum", "scale", "adam", "scale", "scale"])
-        # 2 optimize for table sgd
+        # 4 prefetch -> lookup_sparse_table for data0
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
                          ["sum", "sgd"])
         # 3 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["lookup_sparse_table"])
-        # 4 prefetch -> lookup_sparse_table for data1
         self.assertEqual([op.type for op in pserver1.blocks[4].ops],
                          ["lookup_sparse_table"])
         # 5 save table
         self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
-        trainer, _ = self.get_trainer()
+        trainer, trainer_startup = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
-            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
-            'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
+            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
             'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
             'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
             'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv',
-            'fetch_barrier'
+            'lookup_table_grad', 'split_selected_rows', 'send',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
+            'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+        startup_ops = [
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'uniform_random',
+            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
+            'fake_init'
+        ]
+        self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
+                         startup_ops)
 
 
 class TestAsyncLocalLookupTable(TestDistLookupTableBase):
@@ -520,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         config = fluid.DistributeTranspilerConfig()
         pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
 
-        self.assertEqual(len(pserver1.blocks), 3)
+        self.assertEqual(len(pserver1.blocks), 4)
         # 0 listen_and_serv
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
@@ -529,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["adam", "scale", "scale"])
+        # 3 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["adam", "scale", "scale"])
 
         trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
-            'fill_constant', 'mean_grad', 'cross_entropy_grad',
-            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv',
-            'recv', 'recv', 'concat'
+            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'split_selected_rows', 'send', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv',
+            'recv', 'concat', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 
@@ -558,12 +631,12 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
                          ["adam", "scale", "scale"])
-        # 2 optimize for table sgd
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"])
-        # 3 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["lookup_sparse_table"])
-        # 4 prefetch -> lookup_sparse_table for data1
+        # 2 optimize for table adam
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["adam", "scale", "scale"])
+        # 3 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops], ["sgd"])
+        # 4 prefetch -> lookup_sparse_table for data0
         self.assertEqual([op.type for op in pserver1.blocks[4].ops],
                          ["lookup_sparse_table"])
         # 5 save table
@@ -572,13 +645,15 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
-            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
-            'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
+            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
             'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
             'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
             'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sum', 'split_ids', 'send', 'recv', 'recv'
+            'lookup_table_grad', 'split_selected_rows', 'send',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv',
+            'recv', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index 0296bc2af4e0b79478c34b4cceab32b5a8a50f2f..be3c5f3b9558ec522803ed9a5acedea75cda6ccc 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -85,6 +85,69 @@ class TestDropoutOp5(OpTest):
         self.check_output()
 
 
+class TestDropoutOp6(TestDropoutOp):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
+        self.attrs = {
+            'dropout_prob': 1.0,
+            'fix_seed': True,
+            'is_test': False,
+            'dropout_implementation': 'upscale_in_train'
+        }
+        self.outputs = {
+            'Out': np.zeros((32, 64)).astype('float32'),
+            'Mask': np.zeros((32, 64)).astype('float32')
+        }
+
+
+class TestDropoutOp7(TestDropoutOp):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
+        self.attrs = {
+            'dropout_prob': 0.0,
+            'fix_seed': True,
+            'is_test': False,
+            'dropout_implementation': 'upscale_in_train'
+        }
+        self.outputs = {
+            'Out': self.inputs['X'],
+            'Mask': np.ones((32, 64, 2)).astype('float32')
+        }
+
+
+class TestDropoutOp8(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
+        self.attrs = {
+            'dropout_prob': 0.35,
+            'fix_seed': True,
+            'is_test': True,
+            'dropout_implementation': 'upscale_in_train'
+        }
+        self.outputs = {'Out': self.inputs['X']}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestDropoutOp9(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
+        self.attrs = {
+            'dropout_prob': 0.75,
+            'is_test': True,
+            'dropout_implementation': 'upscale_in_train'
+        }
+        self.outputs = {'Out': self.inputs['X']}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestFP16DropoutOp(OpTest):
     def setUp(self):
         self.op_type = "dropout"
diff --git a/python/paddle/fluid/tests/unittests/test_fake_init_op.py b/python/paddle/fluid/tests/unittests/test_fake_init_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a62b7aed66b59940b4ba654d98479e3e35c7b78b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fake_init_op.py
@@ -0,0 +1,52 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+class TestFakeInitOpSelectedRows(unittest.TestCase):
+    def check_with_place(self, place, is_selected_rows):
+        scope = core.Scope()
+
+        out_var_name = 'Out'
+        if is_selected_rows:
+            out_tensor = scope.var(out_var_name).get_selected_rows().get_tensor(
+            )
+        else:
+            out_tensor = scope.var(out_var_name).get_tensor()
+
+        var_shape = [4, 784]
+
+        # create and run fake_init_op
+        fake_init_op = Operator("fake_init", Out=out_var_name, shape=var_shape)
+        fake_init_op.run(scope, place)
+
+        self.assertEqual(var_shape, out_tensor._get_dims())
+
+    def test_fake_init_selected_rows(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            for is_selected_rows in [True, False]:
+                self.check_with_place(place, is_selected_rows)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
index 36ebc8fb6ea9efdcd1807f5c8917ab1428b3381e..377454e7802e40f90c371987adfe50cce922c764 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
@@ -125,6 +125,12 @@ class TestFusionGRUOpMD2(TestFusionGRUOp):
         self.D = 8
 
 
+class TestFusionGRUOpMD3(TestFusionGRUOp):
+    def set_confs(self):
+        self.M = 17
+        self.D = 15
+
+
 class TestFusionGRUOpBS1(TestFusionGRUOp):
     def set_confs(self):
         self.lod = [[3]]
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba6f1415b1c832eb688443953866652e3458b172
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
@@ -0,0 +1,94 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import random
+from op_test import OpTest
+from test_seq_conv import seqconv
+
+
+class TestSeqConvEltAddRelu(OpTest):
+    def set_conf(self):
+        pass
+
+    def setUp(self):
+        self.op_type = 'fusion_seqconv_eltadd_relu'
+        self.lod = [[6, 4]]
+        self.in_fea_size = 16
+        self.out_fea_size = 8
+        self.context_length = 4
+        self.context_stride = 1
+        self.context_start = 0
+        self.set_conf()
+
+        assert self.context_stride == 1
+
+        T = sum(self.lod[0])
+        x = np.random.uniform(-1, 1, [T, self.in_fea_size]).astype('float32')
+        w = np.random.uniform(
+            -1, 1, [self.in_fea_size * self.context_length,
+                    self.out_fea_size]).astype('float32')
+        b = np.random.uniform(-2, 1, [1, self.out_fea_size]).astype('float32')
+        out = seqconv(x, self.lod, w, self.context_length, self.context_start)
+        out = np.maximum(out + b, 0)
+
+        self.inputs = {'X': (x, self.lod), 'Filter': w, 'Bias': b}
+        self.attrs = {
+            'contextStart': self.context_start,
+            'contextLength': self.context_length,
+            'contextStride': self.context_stride
+        }
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSeqConvEltAddReluBS1(TestSeqConvEltAddRelu):
+    def set_conf(self):
+        self.lod = [[10]]
+
+
+class TestSeqConvEltAddReluBS1Case2(TestSeqConvEltAddRelu):
+    def set_conf(self):
+        self.lod = [[2]]
+
+
+class TestSeqConvEltAddReluCase1(TestSeqConvEltAddRelu):
+    def set_conf(self):
+        self.lod = [[3, 5, 1, 6]]
+        self.context_length = 3
+        self.context_start = -2
+
+
+class TestSeqConvEltAddReluCase2(TestSeqConvEltAddRelu):
+    def set_conf(self):
+        self.lod = [[10, 1, 2, 4, 1, 5, 6]]
+        self.in_fea_size = 2
+        self.context_length = 4
+        self.context_start = -1
+
+
+class TestSeqConvEltAddReluCase3(TestSeqConvEltAddRelu):
+    def set_conf(self):
+        self.lod = [[10, 1, 2, 4, 1, 5, 6]]
+        self.context_length = 5
+        self.context_start = -4
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hash_op.py b/python/paddle/fluid/tests/unittests/test_hash_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1130ea39c42204283885ab1072a52db8c22f8b2e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hash_op.py
@@ -0,0 +1,57 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestScaleOp(OpTest):
+    def setUp(self):
+        self.op_type = "hash"
+        self.init_test_case()
+        self.inputs = {'X': (self.in_seq, self.lod)}
+        self.attrs = {'num_hash': 4, 'mod_by': 10000}
+        self.outputs = {'Out': (self.out_seq, self.lod)}
+
+    def init_test_case(self):
+        np.random.seed = 1
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        #  self.out_seq = np.ones([30, 4, 1], dtype=np.int32)
+        self.out_seq = [
+            [[9662], [9217], [1129], [8487]], [[9662], [9217], [1129], [8487]],
+            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
+            [[9407], [6715], [6949], [8094]], [[8473], [694], [5142], [2479]],
+            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
+            [[4372], [9456], [8204], [6695]], [[6897], [3218], [2013], [1241]],
+            [[8473], [694], [5142], [2479]], [[4372], [9456], [8204], [6695]],
+            [[4372], [9456], [8204], [6695]], [[8473], [694], [5142], [2479]],
+            [[9407], [6715], [6949], [8094]], [[9369], [4525], [8935], [9210]],
+            [[4372], [9456], [8204], [6695]], [[4372], [9456], [8204], [6695]],
+            [[9369], [4525], [8935], [9210]], [[6897], [3218], [2013], [1241]],
+            [[9038], [7951], [5953], [8657]], [[9407], [6715], [6949], [8094]],
+            [[9662], [9217], [1129], [8487]], [[9369], [4525], [8935], [9210]],
+            [[9038], [7951], [5953], [8657]], [[9662], [9217], [1129], [8487]],
+            [[9369], [4525], [8935], [9210]], [[1719], [5986], [9919], [3421]],
+            [[4372], [9456], [8204], [6695]], [[9038], [7951], [5953], [8657]]
+        ]
+        self.out_seq = np.array(self.out_seq)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 1d8d0b55f0c5d7cffa01a100847bdf48b6d7023d..8081813b71d6ecedd874046a95283587a15a004c 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -194,6 +194,14 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1))
         print(str(program))
 
+    def test_sequence_unpad(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10, 5], dtype='float32')
+            length = layers.data(name='length', shape=[1], dtype='int64')
+            self.assertIsNotNone(layers.sequence_unpad(x=x, length=length))
+        print(str(program))
+
     def test_lstm_unit(self):
         program = Program()
         with program_guard(program):
@@ -406,6 +414,19 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_sequence_slice(self):
+        program = Program()
+        with program_guard(program):
+            import numpy as np
+            seqs = layers.data(
+                name='x', shape=[10, 5], dtype='float32', lod_level=1)
+            offset = layers.assign(input=np.array([[0, 1]]).astype('int32'))
+            length = layers.assign(input=np.array([[2, 1]]).astype('int32'))
+            out = layers.sequence_slice(
+                input=seqs, offset=offset, length=length)
+            self.assertIsNotNone(out)
+        print(str(program))
+
     def test_lod_reset(self):
         program = Program()
         with program_guard(program):
@@ -444,6 +465,16 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))
 
+    def test_roi_align(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.roi_align(x, rois, 14, 14, 0.5, 2)
+            self.assertIsNotNone(output)
+        print(str(program))
+
     def test_resize_bilinear(self):
         program = Program()
         with program_guard(program):
@@ -834,6 +865,22 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_affine_grid(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='data', shape=[2, 3, 3], dtype="float32")
+            out, ids = layers.argsort(input=data, axis=1)
+
+            theta = layers.data(name="theta", shape=[2, 3], dtype="float32")
+            out_shape = layers.data(
+                name="out_shape", shape=[-1], dtype="float32")
+            data_0 = layers.affine_grid(theta, out_shape)
+            data_1 = layers.affine_grid(theta, [5, 3, 28, 28])
+
+            self.assertIsNotNone(data_0)
+            self.assertIsNotNone(data_1)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index 48b52a5412eb99fbc7a5c8534a766ede4954e849..a0358f8b401e301312b5b9c0b18733d4275045e3 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
     exe.run(pserver_prog)
 
 
+def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
+                                 trainer_id):
+    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    # loss function
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    # optimizer
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    ps1 = ip + ":" + str(int(port) + 1)
+    ps2 = ip + ":" + port
+    pserver_endpoints = ps1 + "," + ps2
+
+    config = fluid.DistributeTranspilerConfig()
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=sync_mode)
+    pserver_prog = t.get_pserver_program(ps2)
+
+    # pserver2 have no parameter
+    assert (len(pserver_prog.blocks) == 2)
+    assert (len(pserver_prog.blocks[1].ops) == 0)
+
+    pserver_startup = t.get_startup_program(ps2, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+
+
 class TestListenAndServOp(OpTest):
     def setUp(self):
         self.ps_timeout = 5
@@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest):
         self.trainers = 1
         self.trainer_id = 0
 
-    def _start_pserver(self, use_cuda, sync_mode):
+    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
         p = Process(
-            target=run_pserver,
+            target=pserver_func,
             args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
                   self.trainer_id))
         p.daemon = True
@@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest):
 
     def test_handle_signal_in_serv_op(self):
         # run pserver on CPU in sync mode
-        p1 = self._start_pserver(False, True)
+        p1 = self._start_pserver(False, True, run_pserver)
+        self._wait_ps_ready(p1.pid)
+
+        # raise SIGTERM to pserver
+        os.kill(p1.pid, signal.SIGINT)
+        p1.join()
+
+        # run pserver on CPU in async mode
+        p2 = self._start_pserver(False, False, run_pserver)
+        self._wait_ps_ready(p2.pid)
+
+        # raise SIGTERM to pserver
+        os.kill(p2.pid, signal.SIGTERM)
+        p2.join()
+
+    def test_list_and_serv_run_empty_optimize_block(self):
+        # run pserver on CPU in sync mode
+        p1 = self._start_pserver(False, True, run_pserver_with_empty_block)
         self._wait_ps_ready(p1.pid)
 
         # raise SIGTERM to pserver
@@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest):
         p1.join()
 
         # run pserver on CPU in async mode
-        p2 = self._start_pserver(False, False)
+        p2 = self._start_pserver(False, False, run_pserver_with_empty_block)
         self._wait_ps_ready(p2.pid)
 
         # raise SIGTERM to pserver
diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
index 26ce7024117162e8bad403a9d8b8518c27578c83..b109e4ea62669c735128f4824eb9d02ad43900e0 100644
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
@@ -22,15 +22,28 @@ from op_test import OpTest
 class TestMergeIdsOp(OpTest):
     def setUp(self):
         self.op_type = "merge_ids"
-        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
-        x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
-        x1 = np.array([]).astype('float32')
-        x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6],
-                       [0.5, 0.6]]).astype('float32')
-        out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3],
-                        [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32')
-        self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]}
-        self.outputs = {'Out': out}
+        ids1 = np.array([[0], [2], [5], [6]]).astype('int64')
+        ids2 = np.array([[0], [2], [2], [3]]).astype('int64')
+
+        rows1 = np.array([[0], [2]]).astype('int64')
+        rows2 = np.array([[3], [5]]).astype('int64')
+        rows3 = np.array([[6]]).astype('int64')
+
+        x0 = np.array([[0.1, 0.2], [0.2, 0.3]]).astype('float32')
+        x1 = np.array([[0.3, 0.4], [0.4, 0.5]]).astype('float32')
+        x2 = np.array([[0.5, 0.6]]).astype('float32')
+
+        out1 = np.array(
+            [[0.1, 0.2], [0.2, 0.3], [0.4, 0.5], [0.5, 0.6]]).astype('float32')
+        out2 = np.array(
+            [[0.1, 0.2], [0.2, 0.3], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
+
+        self.inputs = {
+            'Ids': [('ids1', ids1), ('ids2', ids2)],
+            "Rows": [('rows1', rows1), ('rows2', rows2), ('rows3', rows3)],
+            "X": [('x0', x0), ('x1', x1), ('x2', x2)]
+        }
+        self.outputs = {'Out': [('out1', out1), ('out2', out2)]}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/fluid/tests/unittests/test_metrics.py b/python/paddle/fluid/tests/unittests/test_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec27884cae2b0462951f6597b1b83e58d1c8af5d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_metrics.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.fluid as fluid
+from paddle.fluid.framework import Program, program_guard
+
+
+class TestMetricsDetectionMap(unittest.TestCase):
+    def test_detection_map(self):
+        program = fluid.Program()
+        with program_guard(program):
+            detect_res = fluid.layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = fluid.layers.data(
+                name='label',
+                shape=[10, 1],
+                append_batch_size=False,
+                dtype='float32')
+            box = fluid.layers.data(
+                name='bbox',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            map_eval = fluid.metrics.DetectionMAP(
+                detect_res, label, box, class_num=21)
+            cur_map, accm_map = map_eval.get_map_var()
+            self.assertIsNotNone(cur_map)
+            self.assertIsNotNone(accm_map)
+        print(str(program))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index 7137fd0fdb7c503492107da684b95989037eb872..cf4346cf2e7a099334ec273546901a91d0ad925d 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -16,6 +16,8 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 from op_test import OpTest
 
 
@@ -88,5 +90,136 @@ class TestMomentumOp2(OpTest):
         self.check_output()
 
 
+class TestLarsMomentumOp(OpTest):
+    def setUp(self):
+        self.op_type = "lars_momentum"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        lars_coeff = 0.001
+        lars_weight_decay = 0.0005
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate
+        }
+
+        self.attrs = {
+            'mu': mu,
+            'lars_coeff': lars_coeff,
+            'lars_weight_decay': lars_weight_decay
+        }
+
+        pnorm = np.sqrt(np.square(param).sum())
+        gnorm = np.sqrt(np.square(grad).sum())
+        local_lr = learning_rate * lars_coeff * pnorm / (
+            gnorm + lars_weight_decay * param)
+        velocity_out = mu * velocity + local_lr * (grad + lars_weight_decay *
+                                                   param)
+        param_out = param - velocity_out
+
+        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSparseMomentumOp(unittest.TestCase):
+    def setUp(self):
+        self.use_nesterov = False
+
+    def check_with_place(self, place):
+        self.init_kernel()
+        scope = core.Scope()
+        # create and initialize Grad Variable
+        height = 10
+        rows = [0, 4, 7]
+        row_numel = 12
+        mu = 1.0
+        use_nesterov = self.use_nesterov
+
+        # create and initialize Param Variable
+        param = scope.var('Param').get_tensor()
+        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param.set(param_array, place)
+        param_out = scope.var("ParamOut").get_tensor()
+        param_out_array = np.full((height, row_numel), 0.0).astype("float32")
+        param_out.set(param_out_array, place)
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        grad_np_array = np.ones((len(rows), row_numel)).astype("float32")
+        grad_np_array[0, 0] = 2.0
+        grad_np_array[2, 8] = 4.0
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(grad_np_array, place)
+
+        velocity = scope.var('Velocity').get_tensor()
+        velocity_np_array = np.ones((height, row_numel)).astype("float32")
+        velocity.set(velocity_np_array, place)
+        velocity_out = scope.var('VelocityOut').get_tensor()
+        velocity_out_np_array = np.full((height, row_numel),
+                                        0.0).astype("float32")
+        velocity_out.set(velocity_out_np_array, place)
+
+        # create and initialize LeraningRate Variable
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), 2.0).astype("float32")
+        lr.set(lr_array, place)
+
+        # create and run operator
+        op = Operator(
+            "momentum",
+            Param='Param',
+            Grad='Grad',
+            Velocity='Velocity',
+            ParamOut='ParamOut',
+            VelocityOut='VelocityOut',
+            LearningRate='LearningRate',
+            mu=mu,
+            use_nesterov=use_nesterov)
+        op.run(scope, place)
+
+        # get and compare result
+        param_out_np_array = np.array(param_out)
+        velocity_out_np_array = np.array(velocity_out)
+
+        # TODO(dzh): add a more suitable general numpy interface
+        # for sparse update.
+        _grad_np_array = np.full((height, row_numel), 0.0).astype("float32")
+        for i in range(len(rows)):
+            _grad_np_array[rows[i]] = grad_np_array[i]
+        _velocity_out = mu * velocity_np_array + _grad_np_array
+        _param = param_array
+        if use_nesterov:
+            _param_out = _param - (_grad_np_array + _velocity_out * mu
+                                   ) * lr_array
+        else:
+            _param_out = _param - lr_array * _velocity_out
+        self.assertTrue((_velocity_out == velocity_out_np_array).all())
+        self.assertTrue((_param_out == param_out_np_array).all())
+
+    def init_kernel(self):
+        pass
+
+    def test_sparse_momentum(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+class TestSparseMomentumOp2(TestSparseMomentumOp):
+    def init_kernel(self):
+        self.use_nesterov = True
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index cc2d692e18430eb48e6e800106eab0c3739d3f53..e7a56bb6386a812e43e5c1b5c08cd0682aa9223a 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -232,6 +232,46 @@ class TestResnet(TestParallelExecutorBase):
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
 
+        if not use_cuda:
+            return
+
+        all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=False,
+            optimizer=optimizer,
+            enable_sequential_execution=True)
+
+        reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=True,
+            optimizer=optimizer,
+            enable_sequential_execution=True)
+
+        for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+        for loss in zip(reduce_first_loss, reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(reduce_last_loss, reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+        for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
     def _check_resnet_convergence(self,
                                   model,
                                   use_cuda=True,
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index a55b2002ed989d4588716202a37aa6f4139825ea..3827743908c1d76931572277323d1dd5ddd05523 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -173,6 +173,8 @@ class TestTransformer(TestParallelExecutorBase):
     def test_main(self):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(transformer, use_cuda=True)
+            self.check_network_convergence(
+                transformer, use_cuda=True, enable_sequential_execution=True)
         self.check_network_convergence(transformer, use_cuda=False, iter=5)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
index dfedf8190f75ec26532f281338f076ca0c7d83af..7f266056a9d98be1a6f67473be65a74957f943e9 100644
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
@@ -37,7 +37,7 @@ def PolygonBoxRestore(input):
     indexes = indexes.repeat(
         [batch_size], axis=0)  # [batch_size, geo_channels/2, 2, h, w]
     return indexes.reshape(
-        input.shape) - input  # [batch_size, geo_channels, h, w]
+        input.shape) * 4 - input  # [batch_size, geo_channels, h, w]
 
 
 class TestPolygonBoxRestoreOp(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..b913127ad625eb25de3ec36edd2161019ed09749
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+from threading import Thread
+
+
+def user_reader(inputs):
+    def _reader():
+        for d in inputs:
+            yield d
+
+    return _reader
+
+
+def batch_feeder(batch_reader, pin_memory=False, img_dtype="float32"):
+    def _feeder():
+        for batch_data in batch_reader():
+            sample_batch = []
+            label_batch = []
+            for sample, label in batch_data:
+                sample_batch.append(sample)
+                label_batch.append([label])
+            tensor = core.LoDTensor()
+            label = core.LoDTensor()
+            place = core.CUDAPinnedPlace() if pin_memory else core.CPUPlace()
+            tensor.set(np.array(sample_batch, dtype=img_dtype), place)
+            label.set(np.array(label_batch, dtype="int64"), place)
+            yield [tensor, label]
+
+    return _feeder
+
+
+class TestPyReader(unittest.TestCase):
+    def setUp(self):
+        self.capacity = 10
+        self.shapes = [(-1, 3, 2, 1), (-1, 1)]
+        self.lod_levels = [0, 0]
+        self.dtypes = ['float32', 'int64']
+
+    def test_pin_memory_pyreader(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+            executor = fluid.Executor(place)
+
+            data_file = fluid.layers.py_reader(
+                capacity=self.capacity,
+                dtypes=self.dtypes,
+                lod_levels=self.lod_levels,
+                shapes=self.shapes)
+            # feed_queue = data_file.queue
+            read_out_data = fluid.layers.read_file(data_file)
+
+            self.inputs = []
+            for _ in range(10):
+                sample = np.random.uniform(
+                    low=0, high=1, size=[3, 2, 1]).astype("float32")
+                label = np.random.uniform(
+                    low=0, high=10, size=[1]).astype("int64")
+                self.inputs.append((sample, label))
+
+            self.input_tensors = []
+            for d, l in batch_feeder(
+                    paddle.batch(
+                        user_reader(self.inputs), batch_size=2),
+                    pin_memory=True
+                    if fluid.core.is_compiled_with_cuda() else False)():
+                ta = fluid.LoDTensorArray()
+                ta.append(d)
+                ta.append(l)
+                self.input_tensors.append(ta)
+
+            self.batched_inputs = []
+            for batch in paddle.batch(user_reader(self.inputs), batch_size=2)():
+                feed_d = []
+                feed_l = []
+                for d, l in batch:
+                    feed_d.append(d)
+                    feed_l.append([l])
+                self.batched_inputs.append([feed_d, feed_l])
+
+            data_file.decorate_tensor_provider(
+                batch_feeder(
+                    paddle.batch(
+                        user_reader(self.inputs), batch_size=2),
+                    pin_memory=True
+                    if fluid.core.is_compiled_with_cuda() else False))
+
+            executor.run(fluid.default_startup_program())
+            self.outputs = []
+
+            data_file.start()
+            for _ in self.input_tensors:
+                self.outputs.append(
+                    executor.run(fetch_list=list(read_out_data)))
+            data_file.reset()
+            self.validate()
+
+    def validate(self):
+        self.assertEqual(len(self.batched_inputs), len(self.outputs))
+        for in_data_list, out_data_list in zip(self.batched_inputs,
+                                               self.outputs):
+            self.assertEqual(len(in_data_list), len(out_data_list))
+            in_data_list_np = [
+                np.array(in_lod_tensor) for in_lod_tensor in in_data_list
+            ]
+            for in_data, out_data in zip(in_data_list_np, out_data_list):
+                self.assertTrue((in_data == out_data).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
index 70848e4e2239e2be160bb0c1a28a5aecd01a87dc..eb12bc741767340a3e7e3580a8b95065d4267693 100644
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -19,33 +19,76 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+
+
+def create_selected_rows_and_tensor(scope, place, height, row_num,
+                                    embedding_size):
+    sr = scope.var("@selected_rows@").get_selected_rows()
+    tensor = scope.var("grad").get_tensor()
+
+    rows = np.random.random_integers(
+        low=0, high=height - 1, size=[row_num, ]).astype('int64')
+    sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32')
+
+    sr.set_height(height)
+    sr.set_rows(rows)
+    sr.get_tensor().set(sr_val, place)
+
+    tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32')
+    for i in range(row_num):
+        row = rows[i]
+        tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :]
+
+    tensor.set(tensor_val, place)
+    return tensor_val, sr_val
 
 
 class TestBase(unittest.TestCase):
-    def setup(self, centered, epsilon=1e-6):
+    def setup(self,
+              place,
+              is_sparse,
+              centered,
+              size,
+              row_num=None,
+              epsilon=1e-6):
         np.random.seed(5)  # fix seed
 
+        self.scope = fluid.global_scope()
+        self.place = place
+
         self.param_name = "param"
-        self.param = np.random.random((123, 321)).astype("float32")
+        self.param = np.random.random(size).astype("float32")
 
         self.mean_square_name = "mean_square"
-        self.mean_square = np.random.random((123, 321)).astype("float32")
+        self.mean_square = np.random.uniform(
+            low=1, high=2, size=size).astype("float32")
 
         self.mean_grad_name = "mean_grad"
-        self.mean_grad = np.random.random((123, 321)).astype("float32")
+        self.mean_grad = np.random.random(size).astype("float32")
 
         self.lr_name = "lr"
         self.learning_rate = np.array([0.01]).astype("float32")
 
         self.grad_name = "grad"
-        self.grad = np.random.random((123, 321)).astype("float32")
+
+        self.is_sparse = is_sparse
+        if self.is_sparse:
+            self.grad_sr_name = "@selected_rows@"
+            self.grad, self.grad_sr = create_selected_rows_and_tensor(
+                self.scope, place, size[0], row_num, size[1])
+        else:
+            self.grad = np.random.random(size).astype("float32")
+            grad_tensor = self.scope.var(self.grad_name).get_tensor()
+            grad_tensor.set(self.grad, place)
 
         self.moment_name = "moment"
-        self.moment = np.zeros((123, 321)).astype("float32")
+        self.moment = np.random.uniform(
+            low=0, high=1, size=size).astype("float32")
 
         self.epsilon = epsilon
         self.decay = 0.9
-        self.momentum = 0.0
+        self.momentum = 0.1
         self.centered = centered
 
         self.ms_out = self.decay * self.mean_square + (1 - self.decay
@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase):
 
         self.param_out = self.param - self.moment_out
 
-    def check(self,
-              actual_t,
-              expect_t,
-              place,
-              out_name,
-              atol=1e-5,
-              equal_nan=False):
-        self.assertTrue(
-            np.allclose(
-                actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
-            + str(expect_t) + "\n" + "But Got" + str(actual_t))
-
-
-class TestRmspropOp(TestBase):
-    def check_with_place(self, place, centered, epsilon):
-        self.setup(centered, epsilon)
-        scope = core.Scope()
-
         # create and initialize Param Variable
-        param = scope.var(self.param_name).get_tensor()
-        param.set(self.param, place)
+        self.param_tensor = self.scope.var(self.param_name).get_tensor()
+        self.param_tensor.set(self.param, place)
 
-        mean_square = scope.var(self.mean_square_name).get_tensor()
-        mean_square.set(self.mean_square, place)
+        self.mean_square_tensor = self.scope.var(
+            self.mean_square_name).get_tensor()
+        self.mean_square_tensor.set(self.mean_square, place)
 
-        lr = scope.var(self.lr_name).get_tensor()
+        lr = self.scope.var(self.lr_name).get_tensor()
         lr.set(self.learning_rate, place)
 
-        grad = scope.var(self.grad_name).get_tensor()
-        grad.set(self.grad, place)
+        self.moment_tensor = self.scope.var(self.moment_name).get_tensor()
+        self.moment_tensor.set(self.moment, place)
 
-        moment = scope.var(self.moment_name).get_tensor()
-        moment.set(self.moment, place)
+        if self.centered:
+            self.mean_grad_tensor = self.scope.var(
+                self.mean_grad_name).get_tensor()
+            self.mean_grad_tensor.set(self.mean_grad, place)
 
-        # create and run sgd operator
+    def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
+        self.assertTrue(
+            np.allclose(
+                actual_t, expect_t, atol=atol),
+            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+            + str(expect_t) + "\n" + "But Got" + str(actual_t))
 
-        if self.centered:
-            mean_grad = scope.var(self.mean_grad_name).get_tensor()
-            mean_grad.set(self.mean_grad, place)
-
-            rmsprop_op = Operator(
-                "rmsprop",
-                Param=self.param_name,
-                Grad=self.grad_name,
-                MeanSquare=self.mean_square_name,
-                MeanGrad=self.mean_grad_name,
-                Moment=self.moment_name,
-                LearningRate=self.lr_name,
-                ParamOut=self.param_name,
-                MeanSquareOut=self.mean_square_name,
-                MomentOut=self.moment_name,
-                MeanGradOut=self.mean_grad_name,
-                epsilon=self.epsilon,
-                decay=self.decay,
-                momentum=self.momentum,
-                centered=True)
-        else:
-            rmsprop_op = Operator(
-                "rmsprop",
-                Param=self.param_name,
-                Grad=self.grad_name,
-                MeanSquare=self.mean_square_name,
-                Moment=self.moment_name,
-                LearningRate=self.lr_name,
-                ParamOut=self.param_name,
-                MeanSquareOut=self.mean_square_name,
-                MomentOut=self.moment_name,
-                epsilon=self.epsilon,
-                decay=self.decay,
-                momentum=self.momentum,
-                centered=False)
-
-        rmsprop_op.run(scope, place)
-
-        atol = 1e-5
-        equal_nan = False
+
+class TestRmspropOp(TestBase):
+    def check_with_place(self,
+                         place,
+                         is_sparse,
+                         centered,
+                         size,
+                         row_num=None,
+                         epsilon=1e-6):
+        self.setup(place, is_sparse, centered, size, row_num, epsilon)
+        self.run_and_check()
+
+    def run_and_check(self):
+        grad_name = self.grad_sr_name if self.is_sparse else self.grad_name
+
+        kwargs = {
+            'Param': self.param_name,
+            'Grad': grad_name,
+            'MeanSquare': self.mean_square_name,
+            'Moment': self.moment_name,
+            'LearningRate': self.lr_name,
+            'ParamOut': self.param_name,
+            'MeanSquareOut': self.mean_square_name,
+            'MomentOut': self.moment_name,
+            'epsilon': self.epsilon,
+            'decay': self.decay,
+            'momentum': self.momentum,
+            'centered': self.centered
+        }
 
         if self.centered:
-            atol = 1e-3
-            equal_nan = True
+            kwargs['MeanGrad'] = self.mean_grad_name
+            kwargs['MeanGradOut'] = self.mean_grad_name
+
+        rmsprop_op = Operator('rmsprop', **kwargs)
+        atol = 1e-6
+
+        rmsprop_op.run(self.scope, self.place)
 
         self.check(
-            np.array(mean_square), self.ms_out, place, self.mean_square_name)
+            np.array(self.mean_square_tensor),
+            self.ms_out,
+            self.place,
+            self.mean_square_name,
+            atol=atol)
         self.check(
-            np.array(moment),
+            np.array(self.moment_tensor),
             self.moment_out,
-            place,
+            self.place,
             self.moment_name,
-            atol=atol,
-            equal_nan=equal_nan)
+            atol=atol)
         self.check(
-            np.array(param),
+            np.array(self.param_tensor),
             self.param_out,
-            place,
+            self.place,
             self.param_name,
-            atol=atol,
-            equal_nan=equal_nan)
+            atol=atol)
 
         if self.centered:
             self.check(
-                np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
+                np.array(self.mean_grad_tensor), self.mg_out, self.place,
+                self.mean_grad_name)
 
     def test_rmsprop(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
+
+        size = (128, 320)
         for place in places:
-            self.check_with_place(place, False, 1e-6)
-            self.check_with_place(place, False, 1e-10)
-            self.check_with_place(place, True, 1e-6)
-            self.check_with_place(place, True, 1e-10)
+            for centered in [False, True]:
+                with fluid.scope_guard(core.Scope()):
+                    self.check_with_place(
+                        place, is_sparse=False, centered=centered, size=size)
+
+                with fluid.scope_guard(core.Scope()):
+                    self.check_with_place(
+                        place,
+                        is_sparse=True,
+                        centered=centered,
+                        row_num=512,
+                        size=size)
+
+                with fluid.scope_guard(core.Scope()):
+                    self.check_with_place(
+                        place,
+                        is_sparse=True,
+                        centered=centered,
+                        row_num=60,
+                        size=size)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a252ea547e4d93d83f64fa9cdb3605eeef0a3cf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
@@ -0,0 +1,170 @@
+#    Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import math
+import sys
+from op_test import OpTest
+
+
+class TestROIAlignOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.calc_roi_align()
+        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
+        self.attrs = {
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width,
+            'sampling_ratio': self.sampling_ratio
+        }
+
+        self.outputs = {'Out': self.out_data}
+
+    def init_test_case(self):
+        self.batch_size = 3
+        self.channels = 3
+        self.height = 8
+        self.width = 6
+
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        self.spatial_scale = 1.0 / 2.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.sampling_ratio = -1
+
+        self.x = np.random.random(self.x_dim).astype('float32')
+
+    def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w,
+                 bin_size_h, bin_size_w):
+        count = roi_bin_grid_h * roi_bin_grid_w
+        bilinear_pos = np.zeros(
+            [self.channels, self.pooled_height, self.pooled_width, count, 4],
+            np.float32)
+        bilinear_w = np.zeros(
+            [self.pooled_height, self.pooled_width, count, 4], np.float32)
+        for ph in range(self.pooled_width):
+            for pw in range(self.pooled_height):
+                c = 0
+                for iy in range(roi_bin_grid_h):
+                    y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \
+                        bin_size_h / roi_bin_grid_h
+                    for ix in range(roi_bin_grid_w):
+                        x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \
+                            bin_size_w / roi_bin_grid_w
+                        if y < -1.0 or y > self.height or \
+                               x < -1.0 or x > self.width:
+                            continue
+                        if y <= 0:
+                            y = 0
+                        if x <= 0:
+                            x = 0
+                        y_low = int(y)
+                        x_low = int(x)
+                        if y_low >= self.height - 1:
+                            y = y_high = y_low = self.height - 1
+                        else:
+                            y_high = y_low + 1
+                        if x_low >= self.width - 1:
+                            x = x_high = x_low = self.width - 1
+                        else:
+                            x_high = x_low + 1
+                        ly = y - y_low
+                        lx = x - x_low
+                        hy = 1 - ly
+                        hx = 1 - lx
+                        for ch in range(self.channels):
+                            bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low,
+                                                                 x_low]
+                            bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low,
+                                                                 x_high]
+                            bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high,
+                                                                 x_low]
+                            bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high,
+                                                                 x_high]
+                        bilinear_w[ph, pw, c, 0] = hy * hx
+                        bilinear_w[ph, pw, c, 1] = hy * lx
+                        bilinear_w[ph, pw, c, 2] = ly * hx
+                        bilinear_w[ph, pw, c, 3] = ly * lx
+                        c = c + 1
+        return bilinear_pos, bilinear_w
+
+    def calc_roi_align(self):
+        self.out_data = np.zeros(
+            (self.rois_num, self.channels, self.pooled_height,
+             self.pooled_width)).astype('float32')
+
+        for i in range(self.rois_num):
+            roi = self.rois[i]
+            roi_batch_id = int(roi[0])
+            x_i = self.x[roi_batch_id]
+            roi_xmin = roi[1] * self.spatial_scale
+            roi_ymin = roi[2] * self.spatial_scale
+            roi_xmax = roi[3] * self.spatial_scale
+            roi_ymax = roi[4] * self.spatial_scale
+            roi_width = max(roi_xmax - roi_xmin, 1)
+            roi_height = max(roi_ymax - roi_ymin, 1)
+            bin_size_h = float(roi_height) / float(self.pooled_height)
+            bin_size_w = float(roi_width) / float(self.pooled_width)
+            roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \
+                                 math.ceil(roi_height / self.pooled_height)
+            roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \
+                                 math.ceil(roi_width / self.pooled_width)
+            count = int(roi_bin_grid_h * roi_bin_grid_w)
+            pre_size = count * self.pooled_width * self.pooled_height
+            bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin,
+                                                     int(roi_bin_grid_h),
+                                                     int(roi_bin_grid_w),
+                                                     bin_size_h, bin_size_w)
+            for ch in range(self.channels):
+                align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1)
+                output_val = align_per_bin.mean(axis=-1)
+                self.out_data[i, ch, :, :] = output_val
+
+    def make_rois(self):
+        rois = []
+        self.rois_lod = [[]]
+        for bno in range(self.batch_size):
+            self.rois_lod[0].append(bno + 1)
+            for i in range(bno + 1):
+                x1 = np.random.random_integers(
+                    0, self.width // self.spatial_scale - self.pooled_width)
+                y1 = np.random.random_integers(
+                    0, self.height // self.spatial_scale - self.pooled_height)
+
+                x2 = np.random.random_integers(x1 + self.pooled_width,
+                                               self.width // self.spatial_scale)
+                y2 = np.random.random_integers(
+                    y1 + self.pooled_height, self.height // self.spatial_scale)
+
+                roi = [bno, x1, y1, x2, y2]
+                rois.append(roi)
+        self.rois_num = len(rois)
+        self.rois = np.array(rois).astype("float32")
+
+    def setUp(self):
+        self.op_type = "roi_align"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index f63dbcd3d7f6bfce3ccc1c42ae41afe42bfad003..1a2c9bb5f43d55d8e6183de0d55bfcc2b9ac3f08 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -50,8 +50,10 @@ def rpn_target_assign(anchor_by_gt_overlap,
             fg_inds, size=(len(fg_inds) - num_fg), replace=False)
     else:
         disable_inds = fg_inds[num_fg:]
+
     labels[disable_inds] = -1
     fg_inds = np.where(labels == 1)[0]
+    bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32)
 
     num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
     bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
@@ -59,18 +61,27 @@ def rpn_target_assign(anchor_by_gt_overlap,
         enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
     else:
         enable_inds = bg_inds[:num_bg]
+
+    fg_fake_inds = np.array([], np.int32)
+    fg_value = np.array([fg_inds[0]], np.int32)
+    fake_num = 0
+    for bg_id in enable_inds:
+        if bg_id in fg_inds:
+            fake_num += 1
+            fg_fake_inds = np.hstack([fg_fake_inds, fg_value])
     labels[enable_inds] = 0
+
+    bbox_inside_weight[fake_num:, :] = 1
     fg_inds = np.where(labels == 1)[0]
     bg_inds = np.where(labels == 0)[0]
-
-    loc_index = fg_inds
-    score_index = np.hstack((fg_inds, bg_inds))
+    loc_index = np.hstack([fg_fake_inds, fg_inds])
+    score_index = np.hstack([fg_inds, bg_inds])
     labels = labels[score_index]
     assert not np.any(labels == -1), "Wrong labels with -1"
 
-    gt_inds = anchor_to_gt_argmax[fg_inds]
+    gt_inds = anchor_to_gt_argmax[loc_index]
 
-    return loc_index, score_index, labels, gt_inds
+    return loc_index, score_index, labels, gt_inds, bbox_inside_weight
 
 
 def get_anchor(n, c, h, w):
@@ -123,9 +134,12 @@ def rpn_target_assign_in_python(all_anchors,
         gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
         iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
 
-        loc_inds, score_inds, labels, gt_inds = rpn_target_assign(
-            iou, rpn_batch_size_per_im, rpn_positive_overlap,
-            rpn_negative_overlap, rpn_fg_fraction, use_random)
+        loc_inds, score_inds, labels, gt_inds, bbox_inside_weight = \
+                         rpn_target_assign(iou, rpn_batch_size_per_im,
+                                           rpn_positive_overlap,
+                                           rpn_negative_overlap,
+                                           rpn_fg_fraction,
+                                           use_random)
         # unmap to all anchor 
         loc_inds = inds_inside[loc_inds]
         score_inds = inds_inside[score_inds]
@@ -139,6 +153,7 @@ def rpn_target_assign_in_python(all_anchors,
             score_indexes = score_inds
             tgt_labels = labels
             tgt_bboxes = box_deltas
+            bbox_inside_weights = bbox_inside_weight
         else:
             loc_indexes = np.concatenate(
                 [loc_indexes, loc_inds + i * anchor_num])
@@ -146,8 +161,10 @@ def rpn_target_assign_in_python(all_anchors,
                 [score_indexes, score_inds + i * anchor_num])
             tgt_labels = np.concatenate([tgt_labels, labels])
             tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
+            bbox_inside_weights = np.vstack([bbox_inside_weights, \
+                                             bbox_inside_weight])
 
-    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
+    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights
 
 
 class TestRpnTargetAssignOp(OpTest):
@@ -182,10 +199,12 @@ class TestRpnTargetAssignOp(OpTest):
         rpn_fg_fraction = 0.5
         use_random = False
 
-        loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python(
-            all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh,
-            rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap,
-            rpn_fg_fraction, use_random)
+        loc_index, score_index, tgt_bbox, labels, bbox_inside_weights = \
+            rpn_target_assign_in_python(all_anchors, gt_boxes, is_crowd,
+                                   im_info, lod, rpn_straddle_thresh,
+                                   rpn_batch_size_per_im, rpn_positive_overlap,
+                                   rpn_negative_overlap,
+                                   rpn_fg_fraction, use_random)
         labels = labels[:, np.newaxis]
 
         self.op_type = "rpn_target_assign"
@@ -207,7 +226,8 @@ class TestRpnTargetAssignOp(OpTest):
             'LocationIndex': loc_index.astype('int32'),
             'ScoreIndex': score_index.astype('int32'),
             'TargetBBox': tgt_bbox.astype('float32'),
-            'TargetLabel': labels.astype('int32')
+            'TargetLabel': labels.astype('int32'),
+            'BBoxInsideWeight': bbox_inside_weights.astype('float32')
         }
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py
index dcc86382e5286f354c4f2e81ead598f12c75b2c1..2285e9496768aea6f48fb7796536e8344839d862 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
@@ -20,6 +20,53 @@ import random
 from op_test import OpTest
 
 
+def seqconv(x,
+            lod,
+            filter,
+            context_length,
+            context_start,
+            padding_trainable=False,
+            padding_data=None):
+    [T, M] = x.shape
+    col = np.zeros((T, context_length * M)).astype('float32')
+    offset = [0]
+    for seq_len in lod[0]:
+        offset.append(offset[-1] + seq_len)
+    begin_pad = np.max([0, -context_start])
+    for i in range(len(offset) - 1):
+        for j in range(context_length):
+            in_begin = offset[i] + context_start + j
+            in_end = offset[i + 1] + context_start + j
+            out_begin = offset[i]
+            out_end = offset[i + 1]
+            if in_begin < offset[i]:
+                pad_size = np.min(
+                    [offset[i] - in_begin, offset[i + 1] - offset[i]])
+                if padding_trainable:
+                    sub_w = padding_data[j:j + pad_size, :]
+                    col[offset[i]:offset[i] + pad_size, j * M:(j + 1) *
+                        M] = sub_w
+                out_begin = offset[i] + pad_size
+                in_begin = offset[i]
+
+            if in_end > offset[i + 1]:
+                pad_size = np.min(
+                    [in_end - offset[i + 1], offset[i + 1] - offset[i]])
+                if padding_trainable:
+                    sub_w = padding_data[begin_pad + context_start + j -
+                                         pad_size:begin_pad + context_start +
+                                         j, :]
+                    col[offset[i + 1] - pad_size:offset[i + 1], j * M:(j + 1) *
+                        M] = sub_w
+                in_end = offset[i + 1]
+                out_end = offset[i + 1] - pad_size
+            if in_end <= in_begin:
+                continue
+            in_sub = x[in_begin:in_end, :]
+            col[out_begin:out_end, j * M:(j + 1) * M] += in_sub
+    return np.dot(col, filter)
+
+
 class TestSeqProject(OpTest):
     def setUp(self):
         self.init_test_case()
@@ -66,57 +113,9 @@ class TestSeqProject(OpTest):
             'paddingTrainable': self.padding_trainable,
             'contextStride': self.context_stride
         }
-        out = np.zeros(
-            (self.input_size[0], self.output_represention)).astype('float32')
+        out = seqconv(x, self.lod, w, self.context_length, self.context_start,
+                      self.padding_trainable, self.pad_data)
         self.outputs = {'Out': out}
-        self.compute()
-
-    def compute(self):
-        x, lod = self.inputs['X']
-        filter = self.inputs['Filter']
-        pading_data = self.pad_data
-        out = np.zeros((self.input_size[0], self.context_length *
-                        self.input_size[1])).astype('float32')
-        offset = [0]
-        for seq_len in lod[0]:
-            offset.append(offset[-1] + seq_len)
-        begin_pad = np.max([0, -self.context_start])
-
-        for i in range(len(offset) - 1):
-            for j in range(self.context_length):
-                in_begin = offset[i] + self.context_start + j
-                in_end = offset[i + 1] + self.context_start + j
-                out_begin = offset[i]
-                out_end = offset[i + 1]
-                if in_begin < offset[i]:
-                    pad_size = np.min(
-                        [offset[i] - in_begin, offset[i + 1] - offset[i]])
-                    if self.padding_trainable:
-                        sub_w = pading_data[j:j + pad_size, :]
-                        out[offset[i]:offset[i] + pad_size, j * self.input_size[
-                            1]:(j + 1) * self.input_size[1]] = sub_w
-                    out_begin = offset[i] + pad_size
-                    in_begin = offset[i]
-
-                if in_end > offset[i + 1]:
-                    pad_size = np.min(
-                        [in_end - offset[i + 1], offset[i + 1] - offset[i]])
-                    if self.padding_trainable:
-                        sub_w = pading_data[begin_pad + self.context_start + j -
-                                            pad_size:begin_pad +
-                                            self.context_start + j, :]
-                        out[offset[i + 1] - pad_size:offset[i + 1], j * self.
-                            input_size[1]:(j + 1) * self.input_size[1]] = sub_w
-                    in_end = offset[i + 1]
-                    out_end = offset[i + 1] - pad_size
-                if in_end <= in_begin:
-                    continue
-
-                in_sub = x[in_begin:in_end, :]
-                out[out_begin:out_end, j * self.input_size[1]:(j + 1) *
-                    self.input_size[1]] += in_sub
-
-        np.dot(out, filter, out=self.outputs['Out'])
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py
index 641eb03a5fbf1bb140b20cc3518cea83386fa577..a80ad5b079891efe1b0e1222b3c2455d4891d5f5 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
             out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
+class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "MAX", 'is_test': True}
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
+
+    def test_check_grad(self):
+        """Grad computation does not apply to Sequence MAX 
+            Pool executed when is_test is true """
+        return
+
+
 class TestSeqLastPool2D(TestSeqAvgPool2D):
     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "LAST"}
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reverse.py b/python/paddle/fluid/tests/unittests/test_sequence_reverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..eebd25e0975f1711ea86093f007212cadc6334f5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reverse.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from op_test import OpTest
+import numpy as np
+
+
+class TestSequenceReverseBase(OpTest):
+    def initParameters(self):
+        pass
+
+    def setUp(self):
+        self.size = (10, 3, 4)
+        self.lod = [2, 3, 5]
+        self.dtype = 'float32'
+        self.initParameters()
+        self.op_type = 'sequence_reverse'
+        self.x = np.random.random(self.size).astype(self.dtype)
+        self.y = self.get_output()
+
+        self.inputs = {'X': (self.x, [self.lod, ]), }
+        self.outputs = {'Y': (self.y, [self.lod, ]), }
+
+    def get_output(self):
+        tmp_x = np.reshape(self.x, newshape=[self.x.shape[0], -1])
+        tmp_y = np.ndarray(tmp_x.shape).astype(self.dtype)
+        prev_idx = 0
+        for cur_len in self.lod:
+            idx_range = range(prev_idx, prev_idx + cur_len)
+            tmp_y[idx_range, :] = np.flip(tmp_x[idx_range, :], 0)
+            prev_idx += cur_len
+
+        return np.reshape(tmp_y, newshape=self.x.shape).astype(self.dtype)
+
+    def test_output(self):
+        self.check_output(0)
+
+    def test_grad(self):
+        self.check_grad(['X'], 'Y')
+
+
+class TestSequenceReserve1(TestSequenceReverseBase):
+    def initParameters(self):
+        self.size = (12, 10)
+        self.lod = [4, 5, 3]
+
+
+class TestSequenceReverse2(TestSequenceReverseBase):
+    def initParameters(self):
+        self.size = (12, 10)
+        self.lod = [12]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..673b0ea180464b8b8f6f5c6e76d5c5c80f347d25
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import six
+import numpy as np
+from op_test import OpTest
+
+
+class TestSequenceUnpadOp(OpTest):
+    def init(self):
+        self.length = [2, 3, 4]
+        self.x_shape = (3, 5)
+        self.dtype = "float32"
+
+    def compute(self):
+        assert len(self.length) == self.x_shape[0]
+        x = np.random.random(self.x_shape).astype(self.dtype)
+        out_lod = [self.length]
+
+        out = x[0, 0:self.length[0]]
+        for i in six.moves.xrange(1, x.shape[0]):
+            out = np.append(out, x[i, 0:self.length[i]], axis=0)
+
+        out_shape = (sum(self.length), )
+        if len(self.x_shape) == 2:
+            out_shape = out_shape + (1, )
+        else:
+            out_shape = out_shape + self.x_shape[2:]
+
+        self.inputs = {
+            'X': x,
+            'Length': np.array(self.length).astype('int64').reshape(-1, 1)
+        }
+        self.outputs = {'Out': (out.reshape(out_shape), out_lod)}
+
+    def setUp(self):
+        self.op_type = 'sequence_unpad'
+        self.init()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestSequenceUnpadOp2(TestSequenceUnpadOp):
+    def init(self):
+        self.length = [2, 3, 4]
+        self.x_shape = (3, 5, 4, 3)
+        self.dtype = "float32"
+
+
+class TestSequenceUnpadOp3(TestSequenceUnpadOp):
+    def init(self):
+        self.length = [5, 2, 3, 4]
+        self.x_shape = (4, 5, 3, 3, 6)
+        self.dtype = "float64"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py
index fab63b7d5631829feffd26fc1dce2bd338d2036b..b16c744603534ad07f9318a5a195f1d7550b1f10 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_var.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
@@ -30,7 +30,6 @@ class TestSliceVar(unittest.TestCase):
             var = program.global_block().create_var(
                 name=str(random.randint(10000, 99999)),
                 persistable=True,
-                # dtype=core.VarDesc.VarType.LOD_TENSOR,
                 shape=shape)
             var_list.append(var)
         blocks = slice_variable(var_list, 10, min_size)
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index a18941dd3126ac027f022ddafbbaed8516166233..37ee880970cf7f6f235e7c43697b2b7872bed38b 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -26,7 +26,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
     Test softmax with cross entropy operator with discreate one-hot labels.
     """
 
+    def initParams(self):
+        self.numeric_stable_mode = False
+
     def setUp(self):
+        self.initParams()
         self.op_type = "softmax_with_cross_entropy"
         batch_size = 41
         class_num = 37
@@ -46,6 +50,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
             "Softmax": softmax.astype("float64"),
             "Loss": cross_entropy.astype("float64")
         }
+        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
 
     def test_check_output(self):
         self.check_output()
@@ -54,6 +59,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.check_grad(["Logits"], "Loss")
 
 
+class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
+    def initParams(self):
+        self.numeric_stable_mode = True
+
+
 class TestSoftmaxWithCrossEntropyOp2(OpTest):
     """
     Test softmax with cross entropy operator with soft labels.
@@ -93,7 +103,11 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
     Test softmax with cross entropy operator with ignore_index.
     """
 
+    def initParams(self):
+        self.numeric_stable_mode = False
+
     def setUp(self):
+        self.initParams()
         self.op_type = "softmax_with_cross_entropy"
         batch_size = 41
         class_num = 37
@@ -114,7 +128,10 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
             "Softmax": softmax.astype("float64"),
             "Loss": cross_entropy.astype("float64")
         }
-        self.attrs = {"ignore_index": ignore_index}
+        self.attrs = {
+            "ignore_index": ignore_index,
+            "numeric_stable_mode": self.numeric_stable_mode
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -123,5 +140,10 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
         self.check_grad(["Logits"], "Loss")
 
 
+class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
+    def initParams(self):
+        self.numeric_stable_mode = True
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
index 4c3d0258980fd8595704a65219deb520b96e222e..d674dad2293921c06135b4ee528538d266cb2904 100644
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -25,18 +25,21 @@ from paddle.fluid.op import Operator
 class TestSplitIdsOp(OpTest):
     def setUp(self):
         self.op_type = "split_ids"
-        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        ids1 = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        ids2 = np.array([[6], [2], [3], [3], [5], [2], [6]]).astype('int64')
+        ids3 = np.array([[2], [2], [2], [3], [5], [5], [6]]).astype('int64')
+
         out0 = np.array([[0], [3], [6]]).astype('int64')
         out1 = np.array([[]]).astype('int64')
-        out2 = np.array([[2], [2], [5], [5]]).astype('int64')
-        self.inputs = {'Ids': ids}
+        out2 = np.array([[2], [5]]).astype('int64')
+        self.inputs = {'Ids': [('ids1', ids1), ('ids2', ids2), ('ids3', ids3)]}
         self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
 
     def test_check_output(self):
         self.check_output()
 
 
-class TestSpliteIds(unittest.TestCase):
+class TestSplitSelectedRows(unittest.TestCase):
     def get_places(self):
         places = [core.CPUPlace()]
         return places
diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
index 41a5ee59ea523b1f6c5015974a12c526e883fa35..50204b8a77c187aa695da83860960566448d290f 100644
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
@@ -99,7 +99,6 @@ class TestSpliteSelectedRows(unittest.TestCase):
         out0_grad.set_height(height)
         out0_grad_tensor = out0_grad.get_tensor()
         np_array = np.ones((len(rows0), row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
         out0_grad_tensor.set(np_array, place)
 
         out1_grad = scope.var("out1@GRAD").get_selected_rows()
@@ -108,7 +107,6 @@ class TestSpliteSelectedRows(unittest.TestCase):
         out1_grad.set_height(height)
         out1_grad_tensor = out1_grad.get_tensor()
         np_array = np.ones((len(rows1), row_numel)).astype("float32")
-        np_array[0, 1] = 4.0
         out1_grad_tensor.set(np_array, place)
 
         x_grad = scope.var("X@GRAD").get_selected_rows()
@@ -121,11 +119,13 @@ class TestSpliteSelectedRows(unittest.TestCase):
 
         grad_op.run(scope, place)
 
-        self.assertEqual(x_grad.rows(), rows0 + rows1)
+        merged_rows = set(rows0 + rows1)
+        self.assertEqual(set(x_grad.rows()), set(rows0 + rows1))
         self.assertEqual(x_grad.height(), height)
 
+        print(np.array(x_grad.get_tensor()))
         self.assertAlmostEqual(2.0, np.array(x_grad.get_tensor())[0, 0])
-        self.assertAlmostEqual(4.0, np.array(x_grad.get_tensor())[2, 1])
+        self.assertAlmostEqual(1.0, np.array(x_grad.get_tensor())[2, 1])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 74797bb65678404b7b35d06eecc7f9a12b2a346e..e20418ff1c8d21f3a3e4ba15ff2aa9d54f37f4b2 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -45,16 +45,30 @@ class TestSumOp(OpTest):
 
 
 class TestSelectedRowsSumOp(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-        self.check_input_and_optput(scope, place, True, True, True)
-        self.check_input_and_optput(scope, place, False, True, True)
-        self.check_input_and_optput(scope, place, False, False, True)
-        self.check_input_and_optput(scope, place, False, False, False)
+    def check_with_place(self, place, inplace):
+        self.height = 10
+        self.row_numel = 12
+        self.rows = [0, 1, 2, 3, 4, 5, 6]
+
+        self.check_input_and_optput(core.Scope(), place, inplace, True, True,
+                                    True)
+        self.check_input_and_optput(core.Scope(), place, inplace, False, True,
+                                    True)
+        self.check_input_and_optput(core.Scope(), place, inplace, False, False,
+                                    True)
+        self.check_input_and_optput(core.Scope(), place, inplace, False, False,
+                                    False)
+
+    def _get_array(self, row_num, row_numel):
+        array = np.ones((row_num, row_numel)).astype("float32")
+        for i in range(row_num):
+            array[i] *= i
+        return array
 
     def check_input_and_optput(self,
                                scope,
                                place,
+                               inplace,
                                w1_has_data=False,
                                w2_has_data=False,
                                w3_has_data=False):
@@ -64,35 +78,43 @@ class TestSelectedRowsSumOp(OpTest):
         self.create_selected_rows(scope, place, "W3", w3_has_data)
 
         # create Out Variable
-        out = scope.var('Out').get_selected_rows()
+        if inplace:
+            out_var_name = "W1"
+        else:
+            out_var_name = "Out"
+        out = scope.var(out_var_name).get_selected_rows()
 
         # create and run sum operator
-        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out='Out')
+        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
         sum_op.run(scope, place)
 
         has_data_w_num = 0
-        for w in [w1_has_data, w2_has_data, w3_has_data]:
-            if not w:
+        for has_data in [w1_has_data, w2_has_data, w3_has_data]:
+            if has_data:
                 has_data_w_num += 1
 
-        self.assertEqual(7 * has_data_w_num, len(out.rows()))
+        if has_data_w_num > 0:
+            self.assertEqual(len(out.rows()), 7)
+            self.assertTrue(
+                np.array_equal(
+                    np.array(out.get_tensor()),
+                    self._get_array(len(self.rows), self.row_numel) *
+                    has_data_w_num))
+        else:
+            self.assertEqual(len(out.rows()), 0)
 
-    def create_selected_rows(self, scope, place, var_name, isEmpty):
+    def create_selected_rows(self, scope, place, var_name, has_data):
         # create and initialize W Variable
-        if not isEmpty:
-            rows = [0, 1, 2, 3, 4, 5, 6]
-            row_numel = 12
+        if has_data:
+            rows = self.rows
         else:
             rows = []
-            row_numel = 12
 
         var = scope.var(var_name)
         w_selected_rows = var.get_selected_rows()
-        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_height(self.height)
         w_selected_rows.set_rows(rows)
-        w_array = np.ones((len(rows), row_numel)).astype("float32")
-        for i in range(len(rows)):
-            w_array[i] *= i
+        w_array = self._get_array(len(rows), self.row_numel)
         w_tensor = w_selected_rows.get_tensor()
         w_tensor.set(w_array, place)
 
@@ -100,9 +122,11 @@ class TestSelectedRowsSumOp(OpTest):
 
     def test_w_is_selected_rows(self):
         places = [core.CPUPlace()]
-        # currently only support CPU
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
         for place in places:
-            self.check_with_place(place)
+            for inplace in [True, False]:
+                self.check_with_place(place, inplace)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py
index e54e170f7f1e03db4b63db72edb7395d18130f68..69b29db83a43d18c0825b610642009a0377b9901 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
@@ -21,22 +21,27 @@ from op_test import OpTest
 
 class TestTopkOp(OpTest):
     def setUp(self):
+        self.set_args()
         self.op_type = "top_k"
-        k = 1
-        input = np.random.random((32, 84)).astype("float32")
-        output = np.ndarray((32, k))
-        indices = np.ndarray((32, k)).astype("int64")
+        k = self.top_k
+        input = np.random.random((self.row, k)).astype("float32")
+        output = np.ndarray((self.row, k))
+        indices = np.ndarray((self.row, k)).astype("int64")
 
         self.inputs = {'X': input}
         self.attrs = {'k': k}
 
-        for rowid in range(32):
+        for rowid in range(self.row):
             row = input[rowid]
-            output[rowid] = np.sort(row)[-k:]
-            indices[rowid] = row.argsort()[-k:]
+            output[rowid] = np.sort(row)[::-1][:k]
+            indices[rowid] = row.argsort()[::-1][:k]
 
         self.outputs = {'Out': output, 'Indices': indices}
 
+    def set_args(self):
+        self.row = 32
+        self.top_k = 1
+
     def test_check_output(self):
         self.check_output()
 
@@ -50,14 +55,39 @@ class TestTopkOp3d(OpTest):
         output = np.ndarray((64, k))
         indices = np.ndarray((64, k)).astype("int64")
 
-        # FIXME: should use 'X': input for a 3d input
-        self.inputs = {'X': input_flat_2d}
+        self.inputs = {'X': input}
         self.attrs = {'k': k}
 
         for rowid in range(64):
             row = input_flat_2d[rowid]
-            output[rowid] = np.sort(row)[-k:]
-            indices[rowid] = row.argsort()[-k:]
+            output[rowid] = np.sort(row)[::-1][:k]
+            indices[rowid] = row.argsort()[::-1][:k]
+
+        self.outputs = {
+            'Out': output.reshape((32, 2, k)),
+            'Indices': indices.reshape((32, 2, k))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTopkOp2(OpTest):
+    def setUp(self):
+        self.op_type = "top_k"
+        k = 1
+        m = 2056
+        input = np.random.random((m, 84)).astype("float32")
+        output = np.ndarray((m, k))
+        indices = np.ndarray((m, k)).astype("int64")
+
+        self.inputs = {'X': input}
+        self.attrs = {'k': k}
+
+        for rowid in range(m):
+            row = input[rowid]
+            output[rowid] = -np.sort(-row)[:k]
+            indices[rowid] = (-row).argsort()[:k]
 
         self.outputs = {'Out': output, 'Indices': indices}
 
@@ -65,5 +95,17 @@ class TestTopkOp3d(OpTest):
         self.check_output()
 
 
+class TestTopkOp3(TestTopkOp):
+    def set_args(self):
+        self.row = 2056
+        self.top_k = 3
+
+
+class TestTopkOp4(TestTopkOp):
+    def set_args(self):
+        self.row = 40000
+        self.top_k = 1
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 91db85b8ec6a32fee3b7aa8ab76429a4a197fcc3..9066fc9d1bf13176862f6debf0ed0bedaaaf3eba 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -35,6 +35,7 @@ import sys
 import numpy as np
 import collections
 import six
+import logging
 
 from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
@@ -49,6 +50,7 @@ LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
 OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
 RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
 )
+OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
 RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
 DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist
 LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
@@ -474,6 +476,26 @@ class DistributeTranspiler(object):
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         delete_ops(self.origin_program.global_block(), lr_ops)
 
+        # delete table init op
+        if self.has_distributed_lookup_table:
+            table_var = self.startup_program.global_block().vars[
+                self.table_name]
+            table_param_init_op = []
+            for op in self.startup_program.global_block().ops:
+                if self.table_name in op.output_arg_names:
+                    table_param_init_op.append(op)
+            init_op_num = len(table_param_init_op)
+            if init_op_num != 1:
+                raise ValueError("table init op num should be 1, now is " + str(
+                    init_op_num))
+            table_init_op = table_param_init_op[0]
+            self.startup_program.global_block().append_op(
+                type="fake_init",
+                inputs={},
+                outputs={"Out": table_var},
+                attrs={"shape": table_init_op.attr('shape')})
+            delete_ops(self.startup_program.global_block(), table_param_init_op)
+
         self.origin_program.__str__()
 
         if wait_port:
@@ -712,7 +734,7 @@ in a single call.")
                 for _, op in enumerate(self.optimize_ops):
                     # optimizer is connected to itself
                     if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
-                        op not in global_ops:
+                            op not in global_ops:
                         log("append opt op: ", op.type, op.input_arg_names,
                             merged_var)
                         __append_optimize_op__(op, per_opt_block,
@@ -746,6 +768,15 @@ in a single call.")
             prefetch_var_name_to_block_id.extend(
                 lookup_table_var_name_to_block_id)
 
+        if len(optimize_blocks) == 0:
+            logging.warn("pserver [" + str(endpoint) +
+                         "] has no optimize block!!")
+            pre_block_idx = pserver_program.num_blocks - 1
+            empty_block = pserver_program._create_block(pre_block_idx)
+            optimize_blocks.append(empty_block)
+
+        # In some case, some parameter server will have no parameter to optimize
+        # So we give an empty optimize block to parameter server.
         attrs = {
             "optimize_blocks": optimize_blocks,
             "endpoint": endpoint,
@@ -889,11 +920,11 @@ to transpile() call.")
             block_idx = int(block_name.split(block_suffix)[1])
             orig_var = self.origin_program.global_block().vars[orig_var_name]
 
-            skip_numel = 0
+            skip_dim0 = 0
             slice_vars = self.param_var_mapping[orig_var_name]
             for slice_var in slice_vars[:block_idx]:
-                skip_numel += reduce(lambda x, y: x * y, slice_var.shape)
-            slice_vars_and_attrs.append([orig_var, skip_numel, param])
+                skip_dim0 += slice_var.shape[0]
+            slice_vars_and_attrs.append([orig_var, skip_dim0, param])
 
         return slice_vars_and_attrs
 
@@ -1033,92 +1064,90 @@ to transpile() call.")
     def _replace_lookup_table_op_with_prefetch(self, program,
                                                pserver_endpoints):
         # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
-        # self.all_prefetch_input_vars =
-        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
-        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_in_ids_vars = []
         self.all_prefetch_input_vars = []
-
-        # self.all_prefetch_input_vars =
-        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
-        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
         self.all_prefetch_output_vars = []
+        self.all_out_emb_vars = []
+        lookup_table_op_index = -1
 
         continue_search_lookup_table_op = True
         while continue_search_lookup_table_op:
             continue_search_lookup_table_op = False
             all_ops = program.global_block().ops
             for op in all_ops:
-                if op.type == LOOKUP_TABLE_TYPE:
+                if op.type == LOOKUP_TABLE_TYPE and self.table_name == op.input(
+                        "W")[0]:
+                    if not op.attr('is_distributed'):
+                        raise RuntimeError(
+                            "lookup_table_op that lookup an distributed embedding table"
+                            "should set is_distributed to true")
                     continue_search_lookup_table_op = True
 
-                    lookup_table_op_index = list(all_ops).index(op)
+                    lookup_table_op_index = lookup_table_op_index if lookup_table_op_index != -1 else list(
+                        all_ops).index(op)
                     ids_name = op.input("Ids")
                     out_name = op.output("Out")
 
                     ids_var = program.global_block().vars[ids_name[0]]
-                    prefetch_input_vars = self._create_splited_vars(
-                        source_var=ids_var,
-                        block=program.global_block(),
-                        tag="_prefetch_in_")
-                    self.all_prefetch_input_vars.append(prefetch_input_vars)
+                    self.all_in_ids_vars.append(ids_var)
 
                     out_var = program.global_block().vars[out_name[0]]
-                    prefetch_output_vars = self._create_splited_vars(
-                        source_var=out_var,
-                        block=program.global_block(),
-                        tag="_prefetch_out_")
-                    self.all_prefetch_output_vars.append(prefetch_output_vars)
-
-                    # insert split_ids_op
-                    program.global_block()._insert_op(
-                        index=lookup_table_op_index,
-                        type="split_ids",
-                        inputs={
-                            'Ids': [
-                                program.global_block().vars[varname]
-                                for varname in ids_name
-                            ]
-                        },
-                        outputs={"Out": prefetch_input_vars})
-
-                    # insert prefetch_op
-                    program.global_block()._insert_op(
-                        index=lookup_table_op_index + 1,
-                        type="prefetch",
-                        inputs={'X': prefetch_input_vars},
-                        outputs={"Out": prefetch_output_vars},
-                        attrs={
-                            "epmap": pserver_endpoints,
-                            # FIXME(qiao) temporarily disable this config because prefetch
-                            # is not act as other rpc op, it's more like a forward op
-                            # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                        })
-
-                    # insert concat_op
-                    program.global_block()._insert_op(
-                        index=lookup_table_op_index + 2,
-                        type="merge_ids",
-                        inputs={
-                            'Ids': [
-                                program.global_block().vars[varname]
-                                for varname in ids_name
-                            ],
-                            'X': prefetch_output_vars
-                        },
-                        outputs={
-                            "Out": [
-                                program.global_block().vars[varname]
-                                for varname in out_name
-                            ]
-                        })
+                    self.all_out_emb_vars.append(out_var)
 
                     # delete lookup_table_op
                     delete_ops(program.global_block(), [op])
                     # break for loop
                     break
 
+        for index in range(len(self.pserver_endpoints)):
+            in_var = program.global_block().create_var(
+                name=str("prefetch_compress_in_tmp_" + str(index)),
+                type=self.all_in_ids_vars[0].type,
+                shape=self.all_in_ids_vars[0].shape,
+                dtype=self.all_in_ids_vars[0].dtype)
+            self.all_prefetch_input_vars.append(in_var)
+
+            out_var = program.global_block().create_var(
+                name=str("prefetch_compress_out_tmp_" + str(index)),
+                type=self.all_out_emb_vars[0].type,
+                shape=self.all_out_emb_vars[0].shape,
+                dtype=self.all_out_emb_vars[0].dtype)
+            self.all_prefetch_output_vars.append(out_var)
+
+        # insert split_ids_op
+        program.global_block()._insert_op(
+            index=lookup_table_op_index,
+            type="split_ids",
+            inputs={'Ids': self.all_in_ids_vars},
+            outputs={"Out": self.all_prefetch_input_vars})
+
+        # insert prefetch_op
+        program.global_block()._insert_op(
+            index=lookup_table_op_index + 1,
+            type="prefetch",
+            inputs={'X': self.all_prefetch_input_vars},
+            outputs={"Out": self.all_prefetch_output_vars},
+            attrs={
+                "epmap": pserver_endpoints,
+                # FIXME(qiao) temporarily disable this config because prefetch
+                # is not act as other rpc op, it's more like a forward op
+                # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+            })
+
+        # insert concat_op
+        program.global_block()._insert_op(
+            index=lookup_table_op_index + 2,
+            type="merge_ids",
+            inputs={
+                'Ids': self.all_in_ids_vars,
+                'Rows': self.all_prefetch_input_vars,
+                'X': self.all_prefetch_output_vars
+            },
+            outputs={"Out": self.all_out_emb_vars})
+
     def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
         # 2. add split_ids_op and send_op to send gradient to pservers
+
         # there should only be one table_name
         all_ops = program.global_block().ops
         table_grad_name = grad_var_name(self.table_name)
@@ -1132,7 +1161,8 @@ to transpile() call.")
                     inputs={
                         'Ids': [program.global_block().vars[table_grad_name]]
                     },
-                    outputs={"Out": self.trainer_side_table_grad_list})
+                    outputs={"Out": self.trainer_side_table_grad_list},
+                    attrs={RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE})
                 program.global_block()._insert_op(
                     index=op_index + 2,
                     type="send",
@@ -1143,7 +1173,7 @@ to transpile() call.")
                         if self.sync_mode else []
                     },
                     attrs={
-                        "sync_mode": self.sync_mode,
+                        "sync_mode": not self.sync_mode,
                         "epmap": pserver_endpoints,
                         RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                         OP_ROLE_VAR_ATTR_NAME: [
@@ -1158,38 +1188,45 @@ to transpile() call.")
         # STEP: create prefetch block
         table_var = pserver_program.global_block().vars[self.table_name]
         prefetch_var_name_to_block_id = []
-        for index in range(len(self.all_prefetch_input_vars)):
-            prefetch_block = pserver_program._create_block(optimize_block.idx)
-            trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
-            pserver_ids = pserver_program.global_block().create_var(
-                name=trainer_ids.name,
-                type=trainer_ids.type,
-                shape=trainer_ids.shape,
-                dtype=trainer_ids.dtype)
-            trainer_out = self.all_prefetch_output_vars[index][pserver_index]
-            pserver_out = pserver_program.global_block().create_var(
-                name=trainer_out.name,
-                type=trainer_out.type,
-                shape=trainer_out.shape,
-                dtype=trainer_out.dtype)
-            prefetch_block.append_op(
-                type="lookup_sparse_table",
-                inputs={'Ids': pserver_ids,
-                        "W": table_var},
-                outputs={"Out": pserver_out},
-                attrs={
-                    "is_sparse": True,  # has no effect on lookup_table op
-                    "is_distributed": True,
-                    "padding_idx": -1
-                })
-            prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
-                prefetch_block.idx))
+        prefetch_block = pserver_program._create_block(optimize_block.idx)
+        trainer_ids = self.all_prefetch_input_vars[pserver_index]
+        pserver_ids = pserver_program.global_block().create_var(
+            name=trainer_ids.name,
+            type=trainer_ids.type,
+            shape=trainer_ids.shape,
+            dtype=trainer_ids.dtype)
+        trainer_out = self.all_prefetch_output_vars[pserver_index]
+        pserver_out = pserver_program.global_block().create_var(
+            name=trainer_out.name,
+            type=trainer_out.type,
+            shape=trainer_out.shape,
+            dtype=trainer_out.dtype)
+        prefetch_block.append_op(
+            type="lookup_sparse_table",
+            inputs={'Ids': pserver_ids,
+                    "W": table_var},
+            outputs={"Out": pserver_out},
+            attrs={
+                "is_sparse": True,  # has no effect on lookup_table op
+                "is_distributed": True,
+                "padding_idx": -1
+            })
+        prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
+            prefetch_block.idx))
         return prefetch_var_name_to_block_id
 
     def _create_table_optimize_block(self, pserver_index, pserver_program,
                                      pre_block_idx, grad_to_block_id):
         # STEP: create table optimize block
+        table_opt_block = pserver_program._create_block(pre_block_idx)
         # create table param and grad var in pserver program
+        # create table optimize block in pserver program
+        table_opt_op = [
+            op for op in self.optimize_ops
+            if 'Param' in op.input_names and op.input("Param")[0] ==
+            self.table_name
+        ][0]
+
         origin_param_var = self.origin_program.global_block().vars[
             self.table_name]
 
@@ -1205,19 +1242,16 @@ to transpile() call.")
             dtype=origin_param_var.dtype,
             type=core.VarDesc.VarType.SELECTED_ROWS,
             persistable=True)
+
         # parameter must be selected rows
         param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
         grad_var = pserver_program.global_block()._clone_variable(
             self.origin_program.global_block().vars[grad_var_name(
                 self.table_name)])
 
-        # create table optimize block in pserver program
-        table_opt_op = [
-            op for op in self.optimize_ops
-            if 'Param' in op.input_names and op.input("Param")[0] ==
-            self.table_name
-        ][0]
-        table_opt_block = pserver_program._create_block(pre_block_idx)
+        lr_var = pserver_program.global_block()._clone_variable(
+            self.origin_program.global_block().vars[table_opt_op.input(
+                "LearningRate")[0]])
 
         if self.sync_mode:
             # create grad vars in pserver program
@@ -1249,8 +1283,6 @@ to transpile() call.")
             grad_var = pserver_program.global_block()._rename_var(
                 origin_grad_name, splited_grad_name)
 
-        lr_var = pserver_program.global_block().vars[table_opt_op.input(
-            "LearningRate")[0]]
         inputs = {
             "Param": [param_var],
             "Grad": [grad_var],
@@ -1258,7 +1290,6 @@ to transpile() call.")
         }
         outputs = {"ParamOut": [param_var]}
         # only support sgd now
-        import logging
         logging.warn(
             "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
             + table_opt_op.type)
@@ -1359,16 +1390,6 @@ to transpile() call.")
             program.global_block()._sync_with_cpp()
         return var_mapping
 
-    def _create_splited_vars(self, source_var, block, tag):
-        return [
-            block.create_var(
-                name=str(source_var.name + tag + str(index)),
-                type=source_var.type,
-                shape=source_var.shape,
-                dtype=source_var.dtype)
-            for index in range(len(self.pserver_endpoints))
-        ]
-
     def _clone_var(self, block, var, persistable=True):
         return block.create_var(
             name=var.name,
@@ -1426,7 +1447,7 @@ to transpile() call.")
         elif op_type == "adamax":
             if varkey in ["Moment", "InfNorm"]:
                 return param_shape
-        elif op_type == "momentum":
+        elif op_type in ["momentum", "lars_momentum"]:
             if varkey == "Velocity":
                 return param_shape
         elif op_type == "rmsprop":
@@ -1435,8 +1456,15 @@ to transpile() call.")
         elif op_type == "decayed_adagrad":
             if varkey == "Moment":
                 return param_shape
+        elif op_type == "ftrl":
+            if varkey in ["SquaredAccumulator", "LinearAccumulator"]:
+                return param_shape
         elif op_type == "sgd":
             pass
+        else:
+            raise ValueError(
+                "Not supported optimizer for distributed training: %s" %
+                op_type)
         return orig_shape
 
     def _get_varname_parts(self, varname):
@@ -1713,8 +1741,10 @@ to transpile() call.")
         lr_ops = []
         block = self.origin_program.global_block()
         for op in block.ops:
-            if int(op.attr(RPC_OP_ROLE_ATTR_NAME)) == int(
-                    LR_SCHED_OP_ROLE_ATTR_VALUE):
+            role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME))
+            if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or \
+                role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \
+                    int(OPT_OP_ROLE_ATTR_VALUE):
                 lr_ops.append(op)
                 log("append lr op: ", op.type)
         return lr_ops
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index c402535b27142e94af339a6c18401ba20bc6564d..9a13cecc646e8534a157fad882fd97836348deb4 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -61,6 +61,9 @@ class InferenceTranspiler(object):
             raise TypeError("scope should be as Scope type or None")
         use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
 
+        if use_mkldnn:
+            self._depthwise_conv_mkldnn(program)
+
         self._fuse_batch_norm(program, place, scope)
         if use_mkldnn:
             self._fuse_conv_bias_mkldnn(program)
@@ -70,11 +73,36 @@ class InferenceTranspiler(object):
                 program)  # ResNet residual block merging
             self._fuse_bn_relu_mkldnn(program)
 
+    def _depthwise_conv_mkldnn(self, program):
+        '''
+        Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.
+        The result is:
+            - before:
+                - any_other_op->depthwise_conv->any_other_op
+            - after:
+                - any_other_op->conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.type == 'depthwise_conv2d':
+                current_op.desc.set_type("conv2d")
+            i = i + 1
+
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
     def _fuse_conv_eltwise_mkldnn(self, program):
         '''
         Transpile the program fusing elementwise_add into conv for MKLDNN
         program. Elementwise add following convolution OP can be fused by adding
-        'fuse_eltwise' attribute to convolution OP and replacing its output
+        'fuse_residual_connection' attribute to convolution OP and replacing its output
         Tensor with second parameter of elementwise_add.
         The result of fuse is:
             - before:
@@ -92,7 +120,8 @@ class InferenceTranspiler(object):
             if current_op.type in ['conv2d']:
                 next_op = self.block.ops[i + 1]
                 if next_op.type == 'elementwise_add':
-                    self._fuse_conv_eltwise(current_op, next_op)
+                    self._fuse_conv_eltwise(i, current_op, next_op)
+                    self.block._remove_op(i + 1)  # Remove old conv
                     self.block._remove_op(i + 1)  # Remove elementwise_add
             i = i + 1
         self._adjust_input()
@@ -444,7 +473,7 @@ class InferenceTranspiler(object):
             outputs={"Output": out_var},
             attrs=attrs)
 
-    def _fuse_conv_eltwise(self, conv_op, eltwise_op):
+    def _fuse_conv_eltwise(self, index, conv_op, eltwise_op):
         '''
         fuse the conv op with elementwise_add
 
@@ -454,9 +483,30 @@ class InferenceTranspiler(object):
         :type eltwise_op: Operator
         '''
 
-        conv_op._set_attr("fuse_eltwise", True)
-        self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0]
-        self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0]
+        eltwise_input = "X"
+        if eltwise_op.input("X")[0] == conv_op.output("Output")[0]:
+            eltwise_input = "Y"
+
+        residual_var = self.block.vars[eltwise_op.input(eltwise_input)[0]]
+        out_var = self.block.vars[eltwise_op.output("Out")[0]]
+        filter_var = self.block.vars[conv_op.input("Filter")[0]]
+        in_var = self.block.vars[conv_op.input("Input")[0]]
+        bias_var = self.block.vars[conv_op.input("Bias")[0]]
+
+        conv_op._set_attr("fuse_residual_connection", True)
+        attrs = {name: conv_op.attr(name) for name in conv_op.attr_names}
+
+        self.block._insert_op(
+            index,
+            type="conv2d",
+            inputs={
+                "Input": in_var,
+                "Filter": filter_var,
+                "Bias": bias_var,
+                "ResidualData": residual_var
+            },
+            outputs={"Output": out_var},
+            attrs=attrs)
 
     def _adjust_input(self):
         for i in range(len(self.block.ops)):
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 861bb5fae5d7a8561ded1f547fbb86ae1e1a073e..c9f1be934773cc28f026f2b867b9e3a4f7aa8472 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -171,7 +171,7 @@ class ControlFlowGraph(object):
                 self._live_out[i] |= self._live_in[s]
             self._live_in[i] = self._uses[i] | (
                 self._live_out[i] - self._defs[i])
-            if live_in[i] != self._live_in[i]:
+            if live_in[i] != set(self._live_in[i]):
                 for d in self._presuccessors[i]:
                     worklist.append(d)
 
@@ -321,8 +321,7 @@ class ControlFlowGraph(object):
 
                         if not compare_shape(x_shape, cache_shape, level):
                             continue
-                        # TODO(qijun): actually, we should compare
-                        # dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
+                        # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
                         if x_dtype != cache_dtype:
                             continue
 
@@ -487,7 +486,6 @@ def memory_optimize(input_program,
             skip_opt_set = grad_set
         else:
             skip_opt_set.update(grad_set)
-
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
         cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index 15595d208583b567b8f768c8d7bd84986ca5a03f..db6fe2d5fff4ed1617d793faee23f01395841768 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -12,4 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__all__ = ['dump_config']
+from .plot import Ploter
+__all__ = ['dump_config', 'Ploter']
diff --git a/python/paddle/utils/plot.py b/python/paddle/utils/plot.py
new file mode 100644
index 0000000000000000000000000000000000000000..08889c0313fc24151cde6ca7b662d81eb53c9d7b
--- /dev/null
+++ b/python/paddle/utils/plot.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+class PlotData(object):
+    def __init__(self):
+        self.step = []
+        self.value = []
+
+    def append(self, step, value):
+        self.step.append(step)
+        self.value.append(value)
+
+    def reset(self):
+        self.step = []
+        self.value = []
+
+
+class Ploter(object):
+    """
+        Plot input data in a 2D graph
+        
+        Args:
+            title: assign the title of input data.
+            step: x_axis of the data.
+            value: y_axis of the data.
+    """
+
+    def __init__(self, *args):
+        self.__args__ = args
+        self.__plot_data__ = {}
+        for title in args:
+            self.__plot_data__[title] = PlotData()
+        # demo in notebooks will use Ploter to plot figure, but when we convert
+        # the ipydb to py file for testing, the import of matplotlib will make the
+        # script crash. So we can use `export DISABLE_PLOT=True` to disable import
+        # these libs
+        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
+        if not self.__plot_is_disabled__():
+            import matplotlib.pyplot as plt
+            from IPython import display
+            self.plt = plt
+            self.display = display
+
+    def __plot_is_disabled__(self):
+        return self.__disable_plot__ == "True"
+
+    def append(self, title, step, value):
+        """
+	    Feed data
+	    
+            Args:
+                title: assign the group data to this subtitle.
+                step: the x_axis of data.
+                value: the y_axis of data.
+            
+            Examples:
+                .. code-block:: python
+                plot_curve = Ploter("Curve 1","Curve 2")
+                plot_curve.append(title="Curve 1",step=1,value=1)
+	"""
+        assert isinstance(title, basestring)
+        assert self.__plot_data__.has_key(title)
+        data = self.__plot_data__[title]
+        assert isinstance(data, PlotData)
+        data.append(step, value)
+
+    def plot(self, path=None):
+        """
+            Plot data in a 2D graph
+
+            Args:
+                path: store the figure to this file path. Defaul None. 
+              
+            Examples:
+                .. code-block:: python
+                plot_curve = Ploter()
+                plot_cure.plot()
+	"""
+        if self.__plot_is_disabled__():
+            return
+
+        titles = []
+        for title in self.__args__:
+            data = self.__plot_data__[title]
+            assert isinstance(data, PlotData)
+            if len(data.step) > 0:
+                titles.append(title)
+                self.plt.plot(data.step, data.value)
+        self.plt.legend(titles, loc='upper left')
+        if path is None:
+            self.display.clear_output(wait=True)
+            self.display.display(self.plt.gcf())
+        else:
+            self.plt.savefig(path)
+        self.plt.gcf().clear()
+
+    def reset(self):
+        for key in self.__plot_data__:
+            data = self.__plot_data__[key]
+            assert isinstance(data, PlotData)
+            data.reset()