diff --git a/README.md b/README.md
index 8d89c6b1ec9e4aefbd64328dedb4e8c7cc50c21b..63abca069a6629ac59739224ded9cd9f06207d0a 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,6 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
-[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
index 68a3d42d7a8a8082730f4cae3b5d4ea33819ca2f..99c9d79b068f5886012fd702d84d0666b9d197b5 100644
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -122,5 +122,9 @@ def parse_args():
         type=str,
         default="",
         help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--use_inference_transpiler',
+        action='store_true',
+        help='If set, uses inference transpiler to optimize the program.')
     args = parser.parse_args()
     return args
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
old mode 100644
new mode 100755
index ece1102dce987cda994ff086b07f756498ce26e6..dcd4d9ea95d816029317a29055b5ca8273ac9f43
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -131,6 +131,11 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
     exe = fluid.Executor(place)
     exe.run(startup_prog)
 
+    # Use inference_transpiler to speedup
+    if args.use_inference_transpiler:
+        t = fluid.InferenceTranspiler()
+        t.transpile(infer_prog, place)
+
     if not args.use_reader_op:
         feed_var_list = [
             var for var in train_prog.global_block().vars.itervalues()
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index f1cd9c99ebfe5dc5ee0d46d61f1e08256c27d9cd..d205e3958234cabfbfeba8c3d725fe618ce48ace 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -26,13 +26,15 @@ function(fetch_include_recursively root_dir)
     endforeach()
 endfunction()
 
-# download library
-message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
+    # download library
+    message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+endif()
 
 if (WITH_ANAKIN)
     message(STATUS "Anakin for inference is enabled")
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index ffdf91a354bd92bdaf3f88344f0a9256638b568c..85f40585da29bab9a107f5546e64870975f4c2d3 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -40,12 +40,12 @@ ExternalProject_Add(
     # NOTE(wuyi):
     # this package is generated by following steps:
     # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
-    # 2. submodule update --init
+    # 2. git submodule update --init
     # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
     #    checkout and clean other dirs under third_party
     # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
-    URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0"
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 4a49a92f2b131bbb38fcf93070ea811e0b1a14e8..ce6a88b51dc98ac46dd3935f12658d60d364ba8c 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -114,7 +114,12 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
 ADD_LIBRARY(cblas STATIC ${dummyfile})
-TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
+
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+  TARGET_LINK_LIBRARIES(cblas dynload_mklml)
+ELSE()
+  TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
+ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
 
 IF(NOT ${CBLAS_FOUND})
   ADD_DEPENDENCIES(cblas extern_openblas)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 0e2df86c19086357ab520edfcd8421e35768c928..fd7fc16bff5651f022b484623243048fbd225b5a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
     set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)
 
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find all fluid modules is used for paddle fluid static library
+# for building inference libs
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
+
 function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
   list(REMOVE_DUPLICATES libs)
@@ -195,6 +209,15 @@ function(cc_library TARGET_NAME)
       list(REMOVE_ITEM cc_library_DEPS warpctc)
       add_dependencies(${TARGET_NAME} warpctc)
     endif()
+    # Only deps libmklml.so, not link
+    if("${cc_library_DEPS};" MATCHES "mklml;")
+      list(REMOVE_ITEM cc_library_DEPS mklml)
+      if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
+        list(APPEND cc_library_DEPS dynload_mklml)
+      endif()
+      add_dependencies(${TARGET_NAME} mklml)
+      target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+    endif()
     target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
   endif()
@@ -241,6 +264,7 @@ function(cc_test TARGET_NAME)
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (${cc_test_SERIAL})
         set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     endif()
   endif()
 endfunction(cc_test)
@@ -305,6 +329,7 @@ function(nv_test TARGET_NAME)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     endif()
   endif()
 endfunction(nv_test)
@@ -552,7 +577,7 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index cd44fe2542bfa8c53721d61b70778226e640d375..0c720faa353438b76a72e1574cb90931ddd0cf73 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
 # make package for paddle fluid shared and static library
 function(copy TARGET)
   set(options "")
@@ -149,21 +136,33 @@ copy(memory_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
 )
 
-set(module "inference")
-copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
-)
+set(inference_deps paddle_fluid_shared paddle_fluid)
 
 if(WITH_CONTRIB)
-  set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
-  copy(contrib_inference_lib DEPS paddle_inference_api
+  message(STATUS "installing contrib")
+  set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
+  if (WITH_ANAKIN)
+    copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
+      SRCS
+      ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
+      ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
+      DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
+    list(APPEND inference_deps contrib_anakin_inference_lib)
+  endif()
+
+  copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
     SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
-       ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
-  DSTS ${contrib_dst_dir} ${contrib_dst_dir}
-  )
+       ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
+    DSTS ${contrib_dst_dir} ${contrib_dst_dir})
+  list(APPEND inference_deps contrib_inference_lib)
 endif()
 
+set(module
"inference") +copy(inference_lib DEPS ${inference_deps} + SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* + DSTS ${dst_dir}/${module} ${dst_dir}/${module} +) + set(module "platform") copy(platform_lib DEPS profiler_py_proto SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst new file mode 100644 index 0000000000000000000000000000000000000000..f67d8b8130030db8d7e7d10b30271a913bd6272a --- /dev/null +++ b/doc/about/about_us.rst @@ -0,0 +1,53 @@ +========= +关于我们 +========= + +什么是PaddlePaddle +-------------------- + +- PaddlePaddle是百度自主研发并开源的深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 + +- 项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验 + +- 框架具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 + +PaddlePaddle的技术特色 +------------------------- + +- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在保证性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 + +- 对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练 + +- 提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程 + +提供基于PaddlePaddle的教育体系 +-------------------------------- + +- 深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习 + +- 深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持 + +- 线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流 + + +提供基于PaddlePaddle的AI服务 +------------------------------ + +- EadyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型 + +- AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务 + +- 深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案 + +你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们 +----------------------------------------------------------- + +- 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈 + +- 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com + +我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步 + + + +PaddlePaddle团队 diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst new file mode 100644 index 0000000000000000000000000000000000000000..496f5b29875443f0c44f50fcb3ca837f4e7bcd12 --- /dev/null +++ b/doc/fluid/api/average.rst @@ -0,0 +1,16 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +============= +fluid.average +============= + +.. _api_fluid_average_WeightedAverage: + +WeightedAverage +--------------- + +.. autoclass:: paddle.fluid.average.WeightedAverage + :members: + :noindex: + diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst new file mode 100644 index 0000000000000000000000000000000000000000..115e0d24b39928cfc349f72e0a21d6374cd8cd75 --- /dev/null +++ b/doc/fluid/api/backward.rst @@ -0,0 +1,23 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +============== +fluid.backward +============== + +.. _api_fluid_backward_append_backward: + +append_backward +--------------- + +.. autofunction:: paddle.fluid.backward.append_backward + :noindex: + +.. _api_fluid_backward_calc_gradient: + +calc_gradient +------------- + +.. autofunction:: paddle.fluid.backward.calc_gradient + :noindex: + diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst index 3ba096388fc87dda3096a9030fe5749e61112c06..aeefbb95a46e5d5ed46375e388a720fad2711779 100644 --- a/doc/fluid/api/clip.rst +++ b/doc/fluid/api/clip.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -==== -clip -==== +========== +fluid.clip +========== + +.. 
_api_fluid_clip_ErrorClipByValue: ErrorClipByValue ---------------- @@ -12,6 +14,8 @@ ErrorClipByValue :members: :noindex: +.. _api_fluid_clip_GradientClipByValue: + GradientClipByValue ------------------- @@ -19,6 +23,8 @@ GradientClipByValue :members: :noindex: +.. _api_fluid_clip_GradientClipByNorm: + GradientClipByNorm ------------------ @@ -26,6 +32,8 @@ GradientClipByNorm :members: :noindex: +.. _api_fluid_clip_GradientClipByGlobalNorm: + GradientClipByGlobalNorm ------------------------ @@ -33,15 +41,3 @@ GradientClipByGlobalNorm :members: :noindex: -append_gradient_clip_ops ------------------------- - -.. autofunction:: paddle.fluid.clip.append_gradient_clip_ops - :noindex: - -error_clip_callback -------------------- - -.. autofunction:: paddle.fluid.clip.error_clip_callback - :noindex: - diff --git a/doc/fluid/api/data.rst b/doc/fluid/api/data.rst deleted file mode 100644 index b56c7332cc284649c7e04328e51a7faa78593a39..0000000000000000000000000000000000000000 --- a/doc/fluid/api/data.rst +++ /dev/null @@ -1,10 +0,0 @@ -================================== -Data Reader Interface and DataSets -================================== - -.. toctree:: - :maxdepth: 1 - - data/data_reader.rst - data/image.rst - data/dataset.rst diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst index 3df5c0307ffed9d101da58b385840b115920e906..11d2890f5b3446e37c3ef31e5a17ebebe169dbc8 100644 --- a/doc/fluid/api/data_feeder.rst +++ b/doc/fluid/api/data_feeder.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -data_feeder -=========== +================= +fluid.data_feeder +================= + +.. _api_fluid_data_feeder_DataFeeder: DataFeeder ---------- diff --git a/doc/fluid/api/detection.rst b/doc/fluid/api/detection.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst deleted file mode 100644 index c0dc9a0d1d9f2f70948dc3c905dca25d7dd43742..0000000000000000000000000000000000000000 --- a/doc/fluid/api/evaluator.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -========= -evaluator -========= - diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst index f67a14c49f372e67d18ec8e6f87da01109376d22..db2842e7f23e74130a966bb347004bee1ccb08fd 100644 --- a/doc/fluid/api/executor.rst +++ b/doc/fluid/api/executor.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======== -executor -======== +============== +fluid.executor +============== + +.. _api_fluid_executor_Executor: Executor -------- @@ -12,24 +14,32 @@ Executor :members: :noindex: +.. _api_fluid_executor_global_scope: + global_scope ------------ .. autofunction:: paddle.fluid.executor.global_scope :noindex: +.. _api_fluid_executor_scope_guard: + scope_guard ----------- .. autofunction:: paddle.fluid.executor.scope_guard :noindex: -switch_scope ------------- +.. _api_fluid_executor__switch_scope: + +_switch_scope +------------- -.. autofunction:: paddle.fluid.executor.switch_scope +.. autofunction:: paddle.fluid.executor._switch_scope :noindex: +.. 
_api_fluid_executor_fetch_var: + fetch_var --------- diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst new file mode 100644 index 0000000000000000000000000000000000000000..51cdfe0c2ed045a5b3247c4fdec9868d756eae86 --- /dev/null +++ b/doc/fluid/api/fluid.rst @@ -0,0 +1,378 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +===== +fluid +===== + +.. _api_fluid_Block: + +Block +----- + +.. autoclass:: paddle.fluid.Block + :members: + :noindex: + +.. _api_fluid_Variable: + +Variable +-------- + +.. autoclass:: paddle.fluid.Variable + :members: + :noindex: + +.. _api_fluid_Program: + +Program +------- + +.. autoclass:: paddle.fluid.Program + :members: + :noindex: + +.. _api_fluid_Operator: + +Operator +-------- + +.. autoclass:: paddle.fluid.Operator + :members: + :noindex: + +.. _api_fluid_default_startup_program: + +default_startup_program +----------------------- + +.. autofunction:: paddle.fluid.default_startup_program + :noindex: + +.. _api_fluid_default_main_program: + +default_main_program +-------------------- + +.. autofunction:: paddle.fluid.default_main_program + :noindex: + +.. _api_fluid_program_guard: + +program_guard +------------- + +.. autofunction:: paddle.fluid.program_guard + :noindex: + +.. _api_fluid_get_var: + +get_var +------- + +.. autofunction:: paddle.fluid.get_var + :noindex: + +.. _api_fluid_Executor: + +Executor +-------- + +.. autoclass:: paddle.fluid.Executor + :members: + :noindex: + +.. _api_fluid_global_scope: + +global_scope +------------ + +.. autofunction:: paddle.fluid.global_scope + :noindex: + +.. _api_fluid_scope_guard: + +scope_guard +----------- + +.. autofunction:: paddle.fluid.scope_guard + :noindex: + +.. _api_fluid__switch_scope: + +_switch_scope +------------- + +.. autofunction:: paddle.fluid._switch_scope + :noindex: + +.. _api_fluid_fetch_var: + +fetch_var +--------- + +.. autofunction:: paddle.fluid.fetch_var + :noindex: + +.. _api_fluid_Go: + +Go +-- + +.. autoclass:: paddle.fluid.Go + :members: + :noindex: + +.. _api_fluid_make_channel: + +make_channel +------------ + +.. autofunction:: paddle.fluid.make_channel + :noindex: + +.. _api_fluid_channel_send: + +channel_send +------------ + +.. autofunction:: paddle.fluid.channel_send + :noindex: + +.. _api_fluid_channel_recv: + +channel_recv +------------ + +.. autofunction:: paddle.fluid.channel_recv + :noindex: + +.. _api_fluid_channel_close: + +channel_close +------------- + +.. autofunction:: paddle.fluid.channel_close + :noindex: + +.. _api_fluid_Select: + +Select +------ + +.. autoclass:: paddle.fluid.Select + :members: + :noindex: + +.. _api_fluid_Trainer: + +Trainer +------- + +.. autoclass:: paddle.fluid.Trainer + :members: + :noindex: + +.. _api_fluid_BeginEpochEvent: + +BeginEpochEvent +--------------- + +.. autoclass:: paddle.fluid.BeginEpochEvent + :members: + :noindex: + +.. _api_fluid_EndEpochEvent: + +EndEpochEvent +------------- + +.. autoclass:: paddle.fluid.EndEpochEvent + :members: + :noindex: + +.. _api_fluid_BeginStepEvent: + +BeginStepEvent +-------------- + +.. autoclass:: paddle.fluid.BeginStepEvent + :members: + :noindex: + +.. _api_fluid_EndStepEvent: + +EndStepEvent +------------ + +.. autoclass:: paddle.fluid.EndStepEvent + :members: + :noindex: + +.. _api_fluid_CheckpointConfig: + +CheckpointConfig +---------------- + +.. autoclass:: paddle.fluid.CheckpointConfig + :members: + :noindex: + +.. _api_fluid_Inferencer: + +Inferencer +---------- + +.. autoclass:: paddle.fluid.Inferencer + :members: + :noindex: + +.. 
_api_fluid_DistributeTranspiler: + +DistributeTranspiler +-------------------- + +.. autoclass:: paddle.fluid.DistributeTranspiler + :members: + :noindex: + +.. _api_fluid_memory_optimize: + +memory_optimize +--------------- + +.. autofunction:: paddle.fluid.memory_optimize + :noindex: + +.. _api_fluid_release_memory: + +release_memory +-------------- + +.. autofunction:: paddle.fluid.release_memory + :noindex: + +.. _api_fluid_ParallelExecutor: + +ParallelExecutor +---------------- + +.. autoclass:: paddle.fluid.ParallelExecutor + :members: + :noindex: + +.. _api_fluid_ExecutionStrategy: + +ExecutionStrategy +----------------- + +.. autoclass:: paddle.fluid.ExecutionStrategy + :members: + :noindex: + +.. _api_fluid_BuildStrategy: + +BuildStrategy +------------- + +.. autoclass:: paddle.fluid.BuildStrategy + :members: + :noindex: + +.. _api_fluid_create_lod_tensor: + +create_lod_tensor +----------------- + +.. autofunction:: paddle.fluid.create_lod_tensor + :noindex: + +.. _api_fluid_create_random_int_lodtensor: + +create_random_int_lodtensor +--------------------------- + +.. autofunction:: paddle.fluid.create_random_int_lodtensor + :noindex: + +.. _api_fluid_LoDTensor: + +LoDTensor +--------- + +.. autoclass:: paddle.fluid.LoDTensor + :members: + :noindex: + +.. _api_fluid_CPUPlace: + +CPUPlace +-------- + +.. autoclass:: paddle.fluid.CPUPlace + :members: + :noindex: + +.. _api_fluid_CUDAPlace: + +CUDAPlace +--------- + +.. autoclass:: paddle.fluid.CUDAPlace + :members: + :noindex: + +.. _api_fluid_CUDAPinnedPlace: + +CUDAPinnedPlace +--------------- + +.. autoclass:: paddle.fluid.CUDAPinnedPlace + :members: + :noindex: + +.. _api_fluid_Tensor: + +Tensor +------ + +.. autoclass:: paddle.fluid.Tensor + :members: + :noindex: + +.. _api_fluid_ParamAttr: + +ParamAttr +--------- + +.. autoclass:: paddle.fluid.ParamAttr + :members: + :noindex: + +.. _api_fluid_WeightNormParamAttr: + +WeightNormParamAttr +------------------- + +.. autoclass:: paddle.fluid.WeightNormParamAttr + :members: + :noindex: + +.. _api_fluid_DataFeeder: + +DataFeeder +---------- + +.. autoclass:: paddle.fluid.DataFeeder + :members: + :noindex: + +.. _api_fluid_Scope: + +Scope +----- + +.. autoclass:: paddle.fluid.Scope + :members: + :noindex: + diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py index 89ab880301b6ac687fd61f556f87f03792c37da3..02efce2bf8392c62a7600c272bedcadc6563f927 100644 --- a/doc/fluid/api/gen_doc.py +++ b/doc/fluid/api/gen_doc.py @@ -29,19 +29,27 @@ def parse_arg(): class DocGenerator(object): - def __init__(self, module_name, stream=sys.stdout): + def __init__(self, module_name=None, stream=sys.stdout): + if module_name == "": + module_name = None self.stream = stream - self.module_name = module_name - if not hasattr(fluid, module_name): - raise ValueError("Cannot find fluid.{0}".format(module_name)) + if module_name is None: + self.module_name = "fluid" else: - self.module = getattr(fluid, module_name) + self.module_name = "fluid." + module_name + if module_name is None: + self.module = fluid + else: + if not hasattr(fluid, module_name): + raise ValueError("Cannot find fluid.{0}".format(module_name)) + else: + self.module = getattr(fluid, module_name) self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! 
''') - self._print_header_(module_name, dot='=', is_title=True) + self._print_header_(self.module_name, dot='=', is_title=True) def print_submodule(self, submodule_name): submodule = getattr(self.module, submodule_name) @@ -60,25 +68,29 @@ class DocGenerator(object): self._print_header_(name, dot='=', is_title=False) def print_item(self, name): - item = getattr(self.module, name) + item = getattr(self.module, name, None) + if item is None: + return if isinstance(item, types.TypeType): self.print_class(name) elif isinstance(item, types.FunctionType): self.print_method(name) else: - raise RuntimeError("Unsupported item {0}".format(name)) + pass def print_class(self, name): + self._print_ref_(name) self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autoclass:: paddle.fluid.{0}.{1} + self.stream.write('''.. autoclass:: paddle.{0}.{1} :members: :noindex: '''.format(self.module_name, name)) def print_method(self, name): + self._print_ref_(name) self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autofunction:: paddle.fluid.{0}.{1} + self.stream.write('''.. autofunction:: paddle.{0}.{1} :noindex: '''.format(self.module_name, name)) @@ -94,6 +106,10 @@ class DocGenerator(object): self.stream.write('\n') self.stream.write('\n') + def _print_ref_(self, name): + self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join( + self.module_name.split(".")), name)) + def main(): args = parse_arg() diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh index 9ce6a9a7c329055a755cdb0a40c8c1c2af09a61c..b14ee29873c50fd011f6c48b754767ac8918252a 100755 --- a/doc/fluid/api/gen_doc.sh +++ b/doc/fluid/api/gen_doc.sh @@ -1,7 +1,9 @@ #!/bin/bash -python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst +python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst -for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler +for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler do python gen_doc.py ${module} > ${module}.rst done + +python gen_doc.py "" > fluid.rst diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst index 29cea9c68221b921939e8e09072d87f9f604e21b..359406819a993e7eaf2155c839373df44d97b103 100644 --- a/doc/fluid/api/index_en.rst +++ b/doc/fluid/api/index_en.rst @@ -1,10 +1,11 @@ -====================== -Fluid -====================== +============= +API Reference +============= .. toctree:: :maxdepth: 1 + fluid.rst layers.rst data_feeder.rst executor.rst @@ -18,3 +19,8 @@ Fluid regularizer.rst io.rst data.rst + transpiler.rst + recordio_writer.rst + backward.rst + average.rst + profiler.rst diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst index 57efc9823ca0300018b4704e2e32105176970e6b..dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607 100644 --- a/doc/fluid/api/initializer.rst +++ b/doc/fluid/api/initializer.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -initializer -=========== +================= +fluid.initializer +================= + +.. _api_fluid_initializer_Constant: Constant -------- @@ -12,6 +14,8 @@ Constant :members: :noindex: +.. 
_api_fluid_initializer_Uniform: + Uniform ------- @@ -19,6 +23,8 @@ Uniform :members: :noindex: +.. _api_fluid_initializer_Normal: + Normal ------ @@ -26,6 +32,8 @@ Normal :members: :noindex: +.. _api_fluid_initializer_Xavier: + Xavier ------ @@ -33,6 +41,8 @@ Xavier :members: :noindex: +.. _api_fluid_initializer_Bilinear: + Bilinear -------- @@ -40,18 +50,33 @@ Bilinear :members: :noindex: +.. _api_fluid_initializer_MSRA: + +MSRA +---- + +.. autoclass:: paddle.fluid.initializer.MSRA + :members: + :noindex: + +.. _api_fluid_initializer_force_init_on_cpu: + force_init_on_cpu ----------------- .. autofunction:: paddle.fluid.initializer.force_init_on_cpu :noindex: +.. _api_fluid_initializer_init_on_cpu: + init_on_cpu ----------- .. autofunction:: paddle.fluid.initializer.init_on_cpu :noindex: +.. _api_fluid_initializer_ConstantInitializer: + ConstantInitializer ------------------- @@ -59,6 +84,8 @@ ConstantInitializer :members: :noindex: +.. _api_fluid_initializer_UniformInitializer: + UniformInitializer ------------------ @@ -66,6 +93,8 @@ UniformInitializer :members: :noindex: +.. _api_fluid_initializer_NormalInitializer: + NormalInitializer ----------------- @@ -73,6 +102,8 @@ NormalInitializer :members: :noindex: +.. _api_fluid_initializer_XavierInitializer: + XavierInitializer ----------------- @@ -80,6 +111,8 @@ XavierInitializer :members: :noindex: +.. _api_fluid_initializer_BilinearInitializer: + BilinearInitializer ------------------- @@ -87,3 +120,12 @@ BilinearInitializer :members: :noindex: +.. _api_fluid_initializer_MSRAInitializer: + +MSRAInitializer +--------------- + +.. autoclass:: paddle.fluid.initializer.MSRAInitializer + :members: + :noindex: + diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst index 21334c9edaada4398ec53455e31625d29f67dc54..7cee0bc4d9aa2c51517d23a381f14a8f63cc3681 100644 --- a/doc/fluid/api/io.rst +++ b/doc/fluid/api/io.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -== -io -== +======== +fluid.io +======== + +.. _api_fluid_io_save_vars: save_vars --------- @@ -11,84 +13,112 @@ save_vars .. autofunction:: paddle.fluid.io.save_vars :noindex: +.. _api_fluid_io_save_params: + save_params ----------- .. autofunction:: paddle.fluid.io.save_params :noindex: +.. _api_fluid_io_save_persistables: + save_persistables ----------------- .. autofunction:: paddle.fluid.io.save_persistables :noindex: +.. _api_fluid_io_load_vars: + load_vars --------- .. autofunction:: paddle.fluid.io.load_vars :noindex: +.. _api_fluid_io_load_params: + load_params ----------- .. autofunction:: paddle.fluid.io.load_params :noindex: +.. _api_fluid_io_load_persistables: + load_persistables ----------------- .. autofunction:: paddle.fluid.io.load_persistables :noindex: +.. _api_fluid_io_save_inference_model: + save_inference_model -------------------- .. autofunction:: paddle.fluid.io.save_inference_model :noindex: +.. _api_fluid_io_load_inference_model: + load_inference_model -------------------- .. autofunction:: paddle.fluid.io.load_inference_model :noindex: +.. _api_fluid_io_get_inference_program: + get_inference_program --------------------- .. autofunction:: paddle.fluid.io.get_inference_program :noindex: +.. _api_fluid_io_save_checkpoint: + save_checkpoint --------------- .. autofunction:: paddle.fluid.io.save_checkpoint :noindex: +.. _api_fluid_io_load_checkpoint: + load_checkpoint --------------- .. autofunction:: paddle.fluid.io.load_checkpoint :noindex: +.. 
_api_fluid_io_clean_checkpoint: + clean_checkpoint ---------------- .. autofunction:: paddle.fluid.io.clean_checkpoint :noindex: +.. _api_fluid_io_load_persist_vars_without_grad: + load_persist_vars_without_grad ------------------------------ .. autofunction:: paddle.fluid.io.load_persist_vars_without_grad :noindex: +.. _api_fluid_io_save_persist_vars_without_grad: + save_persist_vars_without_grad ------------------------------ .. autofunction:: paddle.fluid.io.save_persist_vars_without_grad :noindex: +.. _api_fluid_io_get_latest_checkpoint_serial: + get_latest_checkpoint_serial ---------------------------- diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index 4157faae4c2fe0b803141d84db77b2baed8e3eed..d443c49657b92583e527035f49e74462cf41487d 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -1,25 +1,31 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -====== -layers -====== +============ +fluid.layers +============ control_flow ============ +.. _api_fluid_layers_split_lod_tensor: + split_lod_tensor ---------------- .. autofunction:: paddle.fluid.layers.split_lod_tensor :noindex: +.. _api_fluid_layers_merge_lod_tensor: + merge_lod_tensor ---------------- .. autofunction:: paddle.fluid.layers.merge_lod_tensor :noindex: +.. _api_fluid_layers_BlockGuard: + BlockGuard ---------- @@ -27,6 +33,8 @@ BlockGuard :members: :noindex: +.. _api_fluid_layers_BlockGuardWithCompletion: + BlockGuardWithCompletion ------------------------ @@ -34,12 +42,7 @@ BlockGuardWithCompletion :members: :noindex: -StaticRNNMemoryLink -------------------- - -.. autoclass:: paddle.fluid.layers.StaticRNNMemoryLink - :members: - :noindex: +.. _api_fluid_layers_WhileGuard: WhileGuard ---------- @@ -48,6 +51,8 @@ WhileGuard :members: :noindex: +.. _api_fluid_layers_While: + While ----- @@ -55,6 +60,8 @@ While :members: :noindex: +.. _api_fluid_layers_Switch: + Switch ------ @@ -62,78 +69,104 @@ Switch :members: :noindex: +.. _api_fluid_layers_lod_rank_table: + lod_rank_table -------------- .. autofunction:: paddle.fluid.layers.lod_rank_table :noindex: +.. _api_fluid_layers_max_sequence_len: + max_sequence_len ---------------- .. autofunction:: paddle.fluid.layers.max_sequence_len :noindex: +.. _api_fluid_layers_lod_tensor_to_array: + lod_tensor_to_array ------------------- .. autofunction:: paddle.fluid.layers.lod_tensor_to_array :noindex: +.. _api_fluid_layers_array_to_lod_tensor: + array_to_lod_tensor ------------------- .. autofunction:: paddle.fluid.layers.array_to_lod_tensor :noindex: +.. _api_fluid_layers_increment: + increment --------- .. autofunction:: paddle.fluid.layers.increment :noindex: +.. _api_fluid_layers_array_write: + array_write ----------- .. autofunction:: paddle.fluid.layers.array_write :noindex: +.. _api_fluid_layers_create_array: + create_array ------------ .. autofunction:: paddle.fluid.layers.create_array :noindex: +.. _api_fluid_layers_less_than: + less_than --------- .. autofunction:: paddle.fluid.layers.less_than :noindex: +.. _api_fluid_layers_equal: + equal ----- .. autofunction:: paddle.fluid.layers.equal :noindex: +.. _api_fluid_layers_array_read: + array_read ---------- .. autofunction:: paddle.fluid.layers.array_read :noindex: +.. _api_fluid_layers_shrink_memory: + shrink_memory ------------- .. autofunction:: paddle.fluid.layers.shrink_memory :noindex: +.. _api_fluid_layers_array_length: + array_length ------------ .. autofunction:: paddle.fluid.layers.array_length :noindex: +.. 
_api_fluid_layers_IfElse: + IfElse ------ @@ -141,6 +174,8 @@ IfElse :members: :noindex: +.. _api_fluid_layers_DynamicRNN: + DynamicRNN ---------- @@ -148,6 +183,8 @@ DynamicRNN :members: :noindex: +.. _api_fluid_layers_ConditionalBlock: + ConditionalBlock ---------------- @@ -155,6 +192,8 @@ ConditionalBlock :members: :noindex: +.. _api_fluid_layers_StaticRNN: + StaticRNN --------- @@ -162,12 +201,16 @@ StaticRNN :members: :noindex: +.. _api_fluid_layers_reorder_lod_tensor_by_rank: + reorder_lod_tensor_by_rank -------------------------- .. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank :noindex: +.. _api_fluid_layers_ParallelDo: + ParallelDo ---------- @@ -175,12 +218,16 @@ ParallelDo :members: :noindex: +.. _api_fluid_layers_Print: + Print ----- .. autofunction:: paddle.fluid.layers.Print :noindex: +.. _api_fluid_layers_is_empty: + is_empty -------- @@ -190,6 +237,8 @@ is_empty device ====== +.. _api_fluid_layers_get_places: + get_places ---------- @@ -199,12 +248,16 @@ get_places io == +.. _api_fluid_layers_data: + data ---- .. autofunction:: paddle.fluid.layers.data :noindex: +.. _api_fluid_layers_BlockGuardServ: + BlockGuardServ -------------- @@ -212,6 +265,8 @@ BlockGuardServ :members: :noindex: +.. _api_fluid_layers_ListenAndServ: + ListenAndServ ------------- @@ -219,60 +274,80 @@ ListenAndServ :members: :noindex: +.. _api_fluid_layers_Send: + Send ---- .. autofunction:: paddle.fluid.layers.Send :noindex: +.. _api_fluid_layers_Recv: + Recv ---- .. autofunction:: paddle.fluid.layers.Recv :noindex: +.. _api_fluid_layers_open_recordio_file: + open_recordio_file ------------------ .. autofunction:: paddle.fluid.layers.open_recordio_file :noindex: +.. _api_fluid_layers_open_files: + open_files ---------- .. autofunction:: paddle.fluid.layers.open_files :noindex: +.. _api_fluid_layers_read_file: + read_file --------- .. autofunction:: paddle.fluid.layers.read_file :noindex: +.. _api_fluid_layers_shuffle: + shuffle ------- .. autofunction:: paddle.fluid.layers.shuffle :noindex: +.. _api_fluid_layers_batch: + batch ----- .. autofunction:: paddle.fluid.layers.batch :noindex: +.. _api_fluid_layers_double_buffer: + double_buffer ------------- .. autofunction:: paddle.fluid.layers.double_buffer :noindex: +.. _api_fluid_layers_random_data_generator: + random_data_generator --------------------- .. autofunction:: paddle.fluid.layers.random_data_generator :noindex: +.. _api_fluid_layers_Preprocessor: + Preprocessor ------------ @@ -280,6 +355,8 @@ Preprocessor :members: :noindex: +.. _api_fluid_layers_load: + load ---- @@ -289,584 +366,802 @@ load nn == +.. _api_fluid_layers_fc: + fc -- .. autofunction:: paddle.fluid.layers.fc :noindex: +.. _api_fluid_layers_embedding: + embedding --------- .. autofunction:: paddle.fluid.layers.embedding :noindex: +.. _api_fluid_layers_dynamic_lstm: + dynamic_lstm ------------ .. autofunction:: paddle.fluid.layers.dynamic_lstm :noindex: +.. _api_fluid_layers_dynamic_lstmp: + dynamic_lstmp ------------- .. autofunction:: paddle.fluid.layers.dynamic_lstmp :noindex: +.. _api_fluid_layers_dynamic_gru: + dynamic_gru ----------- .. autofunction:: paddle.fluid.layers.dynamic_gru :noindex: +.. _api_fluid_layers_gru_unit: + gru_unit -------- .. autofunction:: paddle.fluid.layers.gru_unit :noindex: +.. _api_fluid_layers_linear_chain_crf: + linear_chain_crf ---------------- .. autofunction:: paddle.fluid.layers.linear_chain_crf :noindex: +.. _api_fluid_layers_crf_decoding: + crf_decoding ------------ .. 
autofunction:: paddle.fluid.layers.crf_decoding :noindex: +.. _api_fluid_layers_cos_sim: + cos_sim ------- .. autofunction:: paddle.fluid.layers.cos_sim :noindex: +.. _api_fluid_layers_cross_entropy: + cross_entropy ------------- .. autofunction:: paddle.fluid.layers.cross_entropy :noindex: +.. _api_fluid_layers_square_error_cost: + square_error_cost ----------------- .. autofunction:: paddle.fluid.layers.square_error_cost :noindex: +.. _api_fluid_layers_chunk_eval: + chunk_eval ---------- .. autofunction:: paddle.fluid.layers.chunk_eval :noindex: +.. _api_fluid_layers_sequence_conv: + sequence_conv ------------- .. autofunction:: paddle.fluid.layers.sequence_conv :noindex: +.. _api_fluid_layers_conv2d: + conv2d ------ .. autofunction:: paddle.fluid.layers.conv2d :noindex: +.. _api_fluid_layers_conv3d: + conv3d ------ .. autofunction:: paddle.fluid.layers.conv3d :noindex: +.. _api_fluid_layers_sequence_pool: + sequence_pool ------------- .. autofunction:: paddle.fluid.layers.sequence_pool :noindex: +.. _api_fluid_layers_sequence_softmax: + sequence_softmax ---------------- .. autofunction:: paddle.fluid.layers.sequence_softmax :noindex: +.. _api_fluid_layers_softmax: + softmax ------- .. autofunction:: paddle.fluid.layers.softmax :noindex: +.. _api_fluid_layers_pool2d: + pool2d ------ .. autofunction:: paddle.fluid.layers.pool2d :noindex: +.. _api_fluid_layers_pool3d: + pool3d ------ .. autofunction:: paddle.fluid.layers.pool3d :noindex: +.. _api_fluid_layers_batch_norm: + batch_norm ---------- .. autofunction:: paddle.fluid.layers.batch_norm :noindex: +.. _api_fluid_layers_beam_search_decode: + beam_search_decode ------------------ .. autofunction:: paddle.fluid.layers.beam_search_decode :noindex: +.. _api_fluid_layers_conv2d_transpose: + conv2d_transpose ---------------- .. autofunction:: paddle.fluid.layers.conv2d_transpose :noindex: +.. _api_fluid_layers_conv3d_transpose: + conv3d_transpose ---------------- .. autofunction:: paddle.fluid.layers.conv3d_transpose :noindex: +.. _api_fluid_layers_sequence_expand: + sequence_expand --------------- .. autofunction:: paddle.fluid.layers.sequence_expand :noindex: +.. _api_fluid_layers_lstm_unit: + lstm_unit --------- .. autofunction:: paddle.fluid.layers.lstm_unit :noindex: +.. _api_fluid_layers_reduce_sum: + reduce_sum ---------- .. autofunction:: paddle.fluid.layers.reduce_sum :noindex: +.. _api_fluid_layers_reduce_mean: + reduce_mean ----------- .. autofunction:: paddle.fluid.layers.reduce_mean :noindex: +.. _api_fluid_layers_reduce_max: + reduce_max ---------- .. autofunction:: paddle.fluid.layers.reduce_max :noindex: +.. _api_fluid_layers_reduce_min: + reduce_min ---------- .. autofunction:: paddle.fluid.layers.reduce_min :noindex: +.. _api_fluid_layers_reduce_prod: + reduce_prod ----------- .. autofunction:: paddle.fluid.layers.reduce_prod :noindex: +.. _api_fluid_layers_sequence_first_step: + sequence_first_step ------------------- .. autofunction:: paddle.fluid.layers.sequence_first_step :noindex: +.. _api_fluid_layers_sequence_last_step: + sequence_last_step ------------------ .. autofunction:: paddle.fluid.layers.sequence_last_step :noindex: +.. _api_fluid_layers_dropout: + dropout ------- .. autofunction:: paddle.fluid.layers.dropout :noindex: +.. _api_fluid_layers_split: + split ----- .. autofunction:: paddle.fluid.layers.split :noindex: +.. _api_fluid_layers_ctc_greedy_decoder: + ctc_greedy_decoder ------------------ .. autofunction:: paddle.fluid.layers.ctc_greedy_decoder :noindex: +.. 
_api_fluid_layers_edit_distance: + edit_distance ------------- .. autofunction:: paddle.fluid.layers.edit_distance :noindex: +.. _api_fluid_layers_l2_normalize: + l2_normalize ------------ .. autofunction:: paddle.fluid.layers.l2_normalize :noindex: +.. _api_fluid_layers_matmul: + matmul ------ .. autofunction:: paddle.fluid.layers.matmul :noindex: +.. _api_fluid_layers_topk: + topk ---- .. autofunction:: paddle.fluid.layers.topk :noindex: +.. _api_fluid_layers_warpctc: + warpctc ------- .. autofunction:: paddle.fluid.layers.warpctc :noindex: +.. _api_fluid_layers_sequence_reshape: + sequence_reshape ---------------- .. autofunction:: paddle.fluid.layers.sequence_reshape :noindex: +.. _api_fluid_layers_transpose: + transpose --------- .. autofunction:: paddle.fluid.layers.transpose :noindex: +.. _api_fluid_layers_im2sequence: + im2sequence ----------- .. autofunction:: paddle.fluid.layers.im2sequence :noindex: +.. _api_fluid_layers_nce: + nce --- .. autofunction:: paddle.fluid.layers.nce :noindex: +.. _api_fluid_layers_beam_search: + beam_search ----------- .. autofunction:: paddle.fluid.layers.beam_search :noindex: +.. _api_fluid_layers_row_conv: + row_conv -------- .. autofunction:: paddle.fluid.layers.row_conv :noindex: +.. _api_fluid_layers_multiplex: + multiplex --------- .. autofunction:: paddle.fluid.layers.multiplex :noindex: +.. _api_fluid_layers_layer_norm: + layer_norm ---------- .. autofunction:: paddle.fluid.layers.layer_norm :noindex: +.. _api_fluid_layers_softmax_with_cross_entropy: + softmax_with_cross_entropy -------------------------- .. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy :noindex: +.. _api_fluid_layers_smooth_l1: + smooth_l1 --------- .. autofunction:: paddle.fluid.layers.smooth_l1 :noindex: +.. _api_fluid_layers_one_hot: + one_hot ------- .. autofunction:: paddle.fluid.layers.one_hot :noindex: +.. _api_fluid_layers_autoincreased_step_counter: + autoincreased_step_counter -------------------------- .. autofunction:: paddle.fluid.layers.autoincreased_step_counter :noindex: +.. _api_fluid_layers_reshape: + reshape ------- .. autofunction:: paddle.fluid.layers.reshape :noindex: +.. _api_fluid_layers_lod_reset: + lod_reset --------- .. autofunction:: paddle.fluid.layers.lod_reset :noindex: +.. _api_fluid_layers_lrn: + lrn --- .. autofunction:: paddle.fluid.layers.lrn :noindex: +.. _api_fluid_layers_pad: + pad --- .. autofunction:: paddle.fluid.layers.pad :noindex: +.. _api_fluid_layers_label_smooth: + label_smooth ------------ .. autofunction:: paddle.fluid.layers.label_smooth :noindex: +.. _api_fluid_layers_roi_pool: + roi_pool -------- .. autofunction:: paddle.fluid.layers.roi_pool :noindex: +.. _api_fluid_layers_dice_loss: + dice_loss --------- .. autofunction:: paddle.fluid.layers.dice_loss :noindex: +.. _api_fluid_layers_image_resize: + image_resize ------------ .. autofunction:: paddle.fluid.layers.image_resize :noindex: +.. _api_fluid_layers_image_resize_short: + image_resize_short ------------------ .. autofunction:: paddle.fluid.layers.image_resize_short :noindex: +.. _api_fluid_layers_resize_bilinear: + resize_bilinear --------------- .. autofunction:: paddle.fluid.layers.resize_bilinear :noindex: +.. _api_fluid_layers_gather: + gather ------ .. autofunction:: paddle.fluid.layers.gather :noindex: +.. _api_fluid_layers_random_crop: + random_crop ----------- .. autofunction:: paddle.fluid.layers.random_crop :noindex: +.. _api_fluid_layers_mean_iou: + mean_iou -------- .. autofunction:: paddle.fluid.layers.mean_iou :noindex: +.. 
_api_fluid_layers_relu: + +relu +---- + +.. autofunction:: paddle.fluid.layers.relu + :noindex: + +.. _api_fluid_layers_log: + +log +--- + +.. autofunction:: paddle.fluid.layers.log + :noindex: + +.. _api_fluid_layers_crop: + +crop +---- + +.. autofunction:: paddle.fluid.layers.crop + :noindex: + ops === +.. _api_fluid_layers_mean: + mean ---- .. autofunction:: paddle.fluid.layers.mean :noindex: +.. _api_fluid_layers_mul: + mul --- .. autofunction:: paddle.fluid.layers.mul :noindex: +.. _api_fluid_layers_scale: + scale ----- .. autofunction:: paddle.fluid.layers.scale :noindex: +.. _api_fluid_layers_sigmoid_cross_entropy_with_logits: + sigmoid_cross_entropy_with_logits --------------------------------- .. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits :noindex: +.. _api_fluid_layers_elementwise_add: + elementwise_add --------------- .. autofunction:: paddle.fluid.layers.elementwise_add :noindex: +.. _api_fluid_layers_elementwise_div: + elementwise_div --------------- .. autofunction:: paddle.fluid.layers.elementwise_div :noindex: +.. _api_fluid_layers_elementwise_sub: + elementwise_sub --------------- .. autofunction:: paddle.fluid.layers.elementwise_sub :noindex: +.. _api_fluid_layers_elementwise_mul: + elementwise_mul --------------- .. autofunction:: paddle.fluid.layers.elementwise_mul :noindex: +.. _api_fluid_layers_elementwise_max: + elementwise_max --------------- .. autofunction:: paddle.fluid.layers.elementwise_max :noindex: +.. _api_fluid_layers_elementwise_min: + elementwise_min --------------- .. autofunction:: paddle.fluid.layers.elementwise_min :noindex: +.. _api_fluid_layers_elementwise_pow: + elementwise_pow --------------- .. autofunction:: paddle.fluid.layers.elementwise_pow :noindex: +.. _api_fluid_layers_clip: + clip ---- .. autofunction:: paddle.fluid.layers.clip :noindex: +.. _api_fluid_layers_clip_by_norm: + clip_by_norm ------------ .. autofunction:: paddle.fluid.layers.clip_by_norm :noindex: +.. _api_fluid_layers_logical_and: + logical_and ----------- .. autofunction:: paddle.fluid.layers.logical_and :noindex: +.. _api_fluid_layers_logical_or: + logical_or ---------- .. autofunction:: paddle.fluid.layers.logical_or :noindex: +.. _api_fluid_layers_logical_xor: + logical_xor ----------- .. autofunction:: paddle.fluid.layers.logical_xor :noindex: +.. _api_fluid_layers_logical_not: + logical_not ----------- .. autofunction:: paddle.fluid.layers.logical_not :noindex: +.. _api_fluid_layers_uniform_random_batch_size_like: + uniform_random_batch_size_like ------------------------------ .. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like :noindex: +.. _api_fluid_layers_gaussian_random: + gaussian_random --------------- .. autofunction:: paddle.fluid.layers.gaussian_random :noindex: +.. _api_fluid_layers_gaussian_random_batch_size_like: + gaussian_random_batch_size_like ------------------------------- .. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like :noindex: +.. _api_fluid_layers_scatter: + scatter ------- .. autofunction:: paddle.fluid.layers.scatter :noindex: +.. _api_fluid_layers_sum: + sum --- .. autofunction:: paddle.fluid.layers.sum :noindex: +.. _api_fluid_layers_slice: + slice ----- .. autofunction:: paddle.fluid.layers.slice :noindex: +.. _api_fluid_layers_polygon_box_transform: + polygon_box_transform --------------------- .. autofunction:: paddle.fluid.layers.polygon_box_transform :noindex: +.. _api_fluid_layers_shape: + shape ----- .. autofunction:: paddle.fluid.layers.shape :noindex: +.. 
_api_fluid_layers_iou_similarity: + +iou_similarity +-------------- + +.. autofunction:: paddle.fluid.layers.iou_similarity + :noindex: + +.. _api_fluid_layers_maxout: + maxout ------ .. autofunction:: paddle.fluid.layers.maxout :noindex: +.. _api_fluid_layers_sigmoid: + sigmoid ------- .. autofunction:: paddle.fluid.layers.sigmoid :noindex: +.. _api_fluid_layers_logsigmoid: + logsigmoid ---------- .. autofunction:: paddle.fluid.layers.logsigmoid :noindex: +.. _api_fluid_layers_exp: + exp --- .. autofunction:: paddle.fluid.layers.exp :noindex: -relu ----- - -.. autofunction:: paddle.fluid.layers.relu - :noindex: +.. _api_fluid_layers_tanh: tanh ---- @@ -874,71 +1169,87 @@ tanh .. autofunction:: paddle.fluid.layers.tanh :noindex: +.. _api_fluid_layers_tanh_shrink: + tanh_shrink ----------- .. autofunction:: paddle.fluid.layers.tanh_shrink :noindex: +.. _api_fluid_layers_softshrink: + softshrink ---------- .. autofunction:: paddle.fluid.layers.softshrink :noindex: +.. _api_fluid_layers_sqrt: + sqrt ---- .. autofunction:: paddle.fluid.layers.sqrt :noindex: +.. _api_fluid_layers_abs: + abs --- .. autofunction:: paddle.fluid.layers.abs :noindex: +.. _api_fluid_layers_ceil: + ceil ---- .. autofunction:: paddle.fluid.layers.ceil :noindex: +.. _api_fluid_layers_floor: + floor ----- .. autofunction:: paddle.fluid.layers.floor :noindex: +.. _api_fluid_layers_cos: + cos --- .. autofunction:: paddle.fluid.layers.cos :noindex: +.. _api_fluid_layers_sin: + sin --- .. autofunction:: paddle.fluid.layers.sin :noindex: +.. _api_fluid_layers_round: + round ----- .. autofunction:: paddle.fluid.layers.round :noindex: +.. _api_fluid_layers_reciprocal: + reciprocal ---------- .. autofunction:: paddle.fluid.layers.reciprocal :noindex: -log ---- - -.. autofunction:: paddle.fluid.layers.log - :noindex: +.. _api_fluid_layers_square: square ------ @@ -946,90 +1257,120 @@ square .. autofunction:: paddle.fluid.layers.square :noindex: +.. _api_fluid_layers_softplus: + softplus -------- .. autofunction:: paddle.fluid.layers.softplus :noindex: +.. _api_fluid_layers_softsign: + softsign -------- .. autofunction:: paddle.fluid.layers.softsign :noindex: +.. _api_fluid_layers_brelu: + brelu ----- .. autofunction:: paddle.fluid.layers.brelu :noindex: +.. _api_fluid_layers_leaky_relu: + leaky_relu ---------- .. autofunction:: paddle.fluid.layers.leaky_relu :noindex: +.. _api_fluid_layers_soft_relu: + soft_relu --------- .. autofunction:: paddle.fluid.layers.soft_relu :noindex: +.. _api_fluid_layers_elu: + elu --- .. autofunction:: paddle.fluid.layers.elu :noindex: +.. _api_fluid_layers_relu6: + relu6 ----- .. autofunction:: paddle.fluid.layers.relu6 :noindex: +.. _api_fluid_layers_pow: + pow --- .. autofunction:: paddle.fluid.layers.pow :noindex: +.. _api_fluid_layers_stanh: + stanh ----- .. autofunction:: paddle.fluid.layers.stanh :noindex: +.. _api_fluid_layers_hard_sigmoid: + hard_sigmoid ------------ .. autofunction:: paddle.fluid.layers.hard_sigmoid :noindex: +.. _api_fluid_layers_swish: + swish ----- .. autofunction:: paddle.fluid.layers.swish :noindex: +.. _api_fluid_layers_uniform_random: + uniform_random -------------- .. autofunction:: paddle.fluid.layers.uniform_random :noindex: +.. _api_fluid_layers_hard_shrink: + hard_shrink ----------- .. autofunction:: paddle.fluid.layers.hard_shrink :noindex: +.. _api_fluid_layers_cumsum: + cumsum ------ .. autofunction:: paddle.fluid.layers.cumsum :noindex: +.. 
_api_fluid_layers_thresholded_relu: + thresholded_relu ---------------- @@ -1039,198 +1380,391 @@ thresholded_relu tensor ====== +.. _api_fluid_layers_create_tensor: + create_tensor ------------- .. autofunction:: paddle.fluid.layers.create_tensor :noindex: +.. _api_fluid_layers_create_parameter: + create_parameter ---------------- .. autofunction:: paddle.fluid.layers.create_parameter :noindex: +.. _api_fluid_layers_create_global_var: + create_global_var ----------------- .. autofunction:: paddle.fluid.layers.create_global_var :noindex: +.. _api_fluid_layers_cast: + cast ---- .. autofunction:: paddle.fluid.layers.cast :noindex: +.. _api_fluid_layers_concat: + concat ------ .. autofunction:: paddle.fluid.layers.concat :noindex: +.. _api_fluid_layers_sums: + sums ---- .. autofunction:: paddle.fluid.layers.sums :noindex: +.. _api_fluid_layers_assign: + assign ------ .. autofunction:: paddle.fluid.layers.assign :noindex: +.. _api_fluid_layers_fill_constant_batch_size_like: + fill_constant_batch_size_like ----------------------------- .. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like :noindex: +.. _api_fluid_layers_fill_constant: + fill_constant ------------- .. autofunction:: paddle.fluid.layers.fill_constant :noindex: +.. _api_fluid_layers_argmin: + argmin ------ .. autofunction:: paddle.fluid.layers.argmin :noindex: +.. _api_fluid_layers_argmax: + argmax ------ .. autofunction:: paddle.fluid.layers.argmax :noindex: +.. _api_fluid_layers_argsort: + argsort ------- +------- .. autofunction:: paddle.fluid.layers.argsort :noindex: +.. _api_fluid_layers_ones: + ones ---- .. autofunction:: paddle.fluid.layers.ones :noindex: +.. _api_fluid_layers_zeros: + zeros ----- .. autofunction:: paddle.fluid.layers.zeros :noindex: +.. _api_fluid_layers_reverse: + +reverse +------- + +.. autofunction:: paddle.fluid.layers.reverse + :noindex: + +learning_rate_scheduler +======================= + +.. _api_fluid_layers_exponential_decay: + +exponential_decay +----------------- + +.. autofunction:: paddle.fluid.layers.exponential_decay + :noindex: + +.. _api_fluid_layers_natural_exp_decay: + +natural_exp_decay +----------------- + +.. autofunction:: paddle.fluid.layers.natural_exp_decay + :noindex: + +.. _api_fluid_layers_inverse_time_decay: + +inverse_time_decay +------------------ + +.. autofunction:: paddle.fluid.layers.inverse_time_decay + :noindex: + +.. _api_fluid_layers_polynomial_decay: + +polynomial_decay +---------------- + +.. autofunction:: paddle.fluid.layers.polynomial_decay + :noindex: + +.. _api_fluid_layers_piecewise_decay: + +piecewise_decay +--------------- + +.. autofunction:: paddle.fluid.layers.piecewise_decay + :noindex: + +.. _api_fluid_layers_noam_decay: + +noam_decay +---------- + +.. autofunction:: paddle.fluid.layers.noam_decay + :noindex: + +.. _api_fluid_layers_append_LARS: + +append_LARS +----------- + +.. autofunction:: paddle.fluid.layers.append_LARS + :noindex: + detection ========= +.. _api_fluid_layers_prior_box: + prior_box --------- .. autofunction:: paddle.fluid.layers.prior_box :noindex: +.. _api_fluid_layers_multi_box_head: + multi_box_head -------------- .. autofunction:: paddle.fluid.layers.multi_box_head :noindex: +.. _api_fluid_layers_bipartite_match: + bipartite_match --------------- .. autofunction:: paddle.fluid.layers.bipartite_match :noindex: +.. _api_fluid_layers_target_assign: + target_assign ------------- .. autofunction:: paddle.fluid.layers.target_assign :noindex: +.. _api_fluid_layers_detection_output: + detection_output ---------------- .. 
autofunction:: paddle.fluid.layers.detection_output :noindex: +.. _api_fluid_layers_ssd_loss: + ssd_loss -------- .. autofunction:: paddle.fluid.layers.ssd_loss :noindex: +.. _api_fluid_layers_detection_map: + detection_map ------------- .. autofunction:: paddle.fluid.layers.detection_map :noindex: +.. _api_fluid_layers_iou_similarity: + iou_similarity -------------- .. autofunction:: paddle.fluid.layers.iou_similarity :noindex: +.. _api_fluid_layers_box_coder: + box_coder --------- .. autofunction:: paddle.fluid.layers.box_coder :noindex: -learning_rate_scheduler -======================= +metric_op +========= -exponential_decay ------------------ +.. _api_fluid_layers_accuracy: -.. autofunction:: paddle.fluid.layers.exponential_decay +accuracy +-------- + +.. autofunction:: paddle.fluid.layers.accuracy :noindex: -natural_exp_decay ------------------ +.. _api_fluid_layers_auc: -.. autofunction:: paddle.fluid.layers.natural_exp_decay +auc +--- + +.. autofunction:: paddle.fluid.layers.auc :noindex: -inverse_time_decay ------------------- +tensor +====== -.. autofunction:: paddle.fluid.layers.inverse_time_decay +.. _api_fluid_layers_create_tensor: + +create_tensor +------------- + +.. autofunction:: paddle.fluid.layers.create_tensor :noindex: -polynomial_decay +.. _api_fluid_layers_create_parameter: + +create_parameter ---------------- -.. autofunction:: paddle.fluid.layers.polynomial_decay +.. autofunction:: paddle.fluid.layers.create_parameter :noindex: -piecewise_decay ---------------- +.. _api_fluid_layers_create_global_var: -.. autofunction:: paddle.fluid.layers.piecewise_decay +create_global_var +----------------- + +.. autofunction:: paddle.fluid.layers.create_global_var :noindex: -noam_decay ----------- +.. _api_fluid_layers_cast: -.. autofunction:: paddle.fluid.layers.noam_decay +cast +---- + +.. autofunction:: paddle.fluid.layers.cast :noindex: -metric -====== +.. _api_fluid_layers_concat: -accuracy --------- +concat +------ -.. autofunction:: paddle.fluid.layers.accuracy +.. autofunction:: paddle.fluid.layers.concat :noindex: -auc ---- +.. _api_fluid_layers_sums: -.. autofunction:: paddle.fluid.layers.auc +sums +---- + +.. autofunction:: paddle.fluid.layers.sums + :noindex: + +.. _api_fluid_layers_assign: + +assign +------ + +.. autofunction:: paddle.fluid.layers.assign + :noindex: + +.. _api_fluid_layers_fill_constant_batch_size_like: + +fill_constant_batch_size_like +----------------------------- + +.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like + :noindex: + +.. _api_fluid_layers_fill_constant: + +fill_constant +------------- + +.. autofunction:: paddle.fluid.layers.fill_constant + :noindex: + +.. _api_fluid_layers_argmin: + +argmin +------ + +.. autofunction:: paddle.fluid.layers.argmin + :noindex: + +.. _api_fluid_layers_argmax: + +argmax +------ + +.. autofunction:: paddle.fluid.layers.argmax + :noindex: + +.. _api_fluid_layers_ones: + +ones +---- + +.. autofunction:: paddle.fluid.layers.ones + :noindex: + +.. _api_fluid_layers_zeros: + +zeros +----- + +.. autofunction:: paddle.fluid.layers.zeros + :noindex: + +.. _api_fluid_layers_reverse: + +reverse +------- + +.. autofunction:: paddle.fluid.layers.reverse :noindex: diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst index ddf07775d7ea293acd421b8549d03b277ff0611d..0f54b2e2eb7ead353215c5dbd529293794e37123 100644 --- a/doc/fluid/api/metrics.rst +++ b/doc/fluid/api/metrics.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! 
-======= -metrics -======= +============= +fluid.metrics +============= + +.. _api_fluid_metrics_MetricBase: MetricBase ---------- @@ -12,6 +14,8 @@ MetricBase :members: :noindex: +.. _api_fluid_metrics_CompositeMetric: + CompositeMetric --------------- @@ -19,6 +23,26 @@ CompositeMetric :members: :noindex: +.. _api_fluid_metrics_Precision: + +Precision +--------- + +.. autoclass:: paddle.fluid.metrics.Precision + :members: + :noindex: + +.. _api_fluid_metrics_Recall: + +Recall +------ + +.. autoclass:: paddle.fluid.metrics.Recall + :members: + :noindex: + +.. _api_fluid_metrics_Accuracy: + Accuracy -------- @@ -26,6 +50,8 @@ Accuracy :members: :noindex: +.. _api_fluid_metrics_ChunkEvaluator: + ChunkEvaluator -------------- @@ -33,6 +59,8 @@ ChunkEvaluator :members: :noindex: +.. _api_fluid_metrics_EditDistance: + EditDistance ------------ @@ -40,6 +68,8 @@ EditDistance :members: :noindex: +.. _api_fluid_metrics_DetectionMAP: + DetectionMAP ------------ @@ -47,6 +77,8 @@ DetectionMAP :members: :noindex: +.. _api_fluid_metrics_Auc: + Auc --- diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst index 7ae3187304f386a08c5cb8a4ba093423a58a7f36..059733af18517257b6821d95fd628a9e13e6e98e 100644 --- a/doc/fluid/api/nets.rst +++ b/doc/fluid/api/nets.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -==== -nets -==== +========== +fluid.nets +========== + +.. _api_fluid_nets_simple_img_conv_pool: simple_img_conv_pool -------------------- @@ -11,18 +13,24 @@ simple_img_conv_pool .. autofunction:: paddle.fluid.nets.simple_img_conv_pool :noindex: +.. _api_fluid_nets_sequence_conv_pool: + sequence_conv_pool ------------------ .. autofunction:: paddle.fluid.nets.sequence_conv_pool :noindex: +.. _api_fluid_nets_glu: + glu --- .. autofunction:: paddle.fluid.nets.glu :noindex: +.. _api_fluid_nets_scaled_dot_product_attention: + scaled_dot_product_attention ---------------------------- diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index 6ad44bb6905b6e3f2b6e4aeb3701ced5d18e2005..8d792120f2f16a8c92606b343eb4c3d4368bed14 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -========= -optimizer -========= +=============== +fluid.optimizer +=============== + +.. _api_fluid_optimizer_SGD: SGD --- @@ -12,6 +14,8 @@ SGD :members: :noindex: +.. _api_fluid_optimizer_Momentum: + Momentum -------- @@ -19,6 +23,8 @@ Momentum :members: :noindex: +.. _api_fluid_optimizer_Adagrad: + Adagrad ------- @@ -26,6 +32,8 @@ Adagrad :members: :noindex: +.. _api_fluid_optimizer_Adam: + Adam ---- @@ -33,6 +41,8 @@ Adam :members: :noindex: +.. _api_fluid_optimizer_Adamax: + Adamax ------ @@ -40,6 +50,8 @@ Adamax :members: :noindex: +.. _api_fluid_optimizer_DecayedAdagrad: + DecayedAdagrad -------------- @@ -47,6 +59,17 @@ DecayedAdagrad :members: :noindex: +.. _api_fluid_optimizer_Ftrl: + +Ftrl +---- + +.. autoclass:: paddle.fluid.optimizer.Ftrl + :members: + :noindex: + +.. _api_fluid_optimizer_SGDOptimizer: + SGDOptimizer ------------ @@ -54,6 +77,8 @@ SGDOptimizer :members: :noindex: +.. _api_fluid_optimizer_MomentumOptimizer: + MomentumOptimizer ----------------- @@ -61,6 +86,8 @@ MomentumOptimizer :members: :noindex: +.. _api_fluid_optimizer_AdagradOptimizer: + AdagradOptimizer ---------------- @@ -68,6 +95,8 @@ AdagradOptimizer :members: :noindex: +.. 
_api_fluid_optimizer_AdamOptimizer: + AdamOptimizer ------------- @@ -75,6 +104,8 @@ AdamOptimizer :members: :noindex: +.. _api_fluid_optimizer_AdamaxOptimizer: + AdamaxOptimizer --------------- @@ -82,6 +113,8 @@ AdamaxOptimizer :members: :noindex: +.. _api_fluid_optimizer_DecayedAdagradOptimizer: + DecayedAdagradOptimizer ----------------------- @@ -89,6 +122,8 @@ DecayedAdagradOptimizer :members: :noindex: +.. _api_fluid_optimizer_RMSPropOptimizer: + RMSPropOptimizer ---------------- @@ -96,6 +131,17 @@ RMSPropOptimizer :members: :noindex: +.. _api_fluid_optimizer_FtrlOptimizer: + +FtrlOptimizer +------------- + +.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer + :members: + :noindex: + +.. _api_fluid_optimizer_Adadelta: + Adadelta -------- @@ -103,6 +149,8 @@ Adadelta :members: :noindex: +.. _api_fluid_optimizer_ModelAverage: + ModelAverage ------------ @@ -110,6 +158,8 @@ ModelAverage :members: :noindex: +.. _api_fluid_optimizer_Optimizer: + Optimizer --------- @@ -117,3 +167,12 @@ Optimizer :members: :noindex: +.. _api_fluid_optimizer_RMSPropOptimizer: + +RMSPropOptimizer +---------------- + +.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer + :members: + :noindex: + diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst index 8e4ddb2b0492d0fcfcade199fdd6dfe43faa7075..33035bbc7ca5c8d000adeaf1cb79806a3ea64604 100644 --- a/doc/fluid/api/param_attr.rst +++ b/doc/fluid/api/param_attr.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -========== -param_attr -========== +================ +fluid.param_attr +================ + +.. _api_fluid_param_attr_ParamAttr: ParamAttr --------- @@ -12,6 +14,8 @@ ParamAttr :members: :noindex: +.. _api_fluid_param_attr_WeightNormParamAttr: + WeightNormParamAttr ------------------- diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst index 39fda65863471a78895503184848a754828b71a1..c750a2d588df56728ac7f73051ab7a9e44dee232 100644 --- a/doc/fluid/api/profiler.rst +++ b/doc/fluid/api/profiler.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======== -profiler -======== +============== +fluid.profiler +============== + +.. _api_fluid_profiler_cuda_profiler: cuda_profiler ------------- @@ -11,24 +13,32 @@ cuda_profiler .. autofunction:: paddle.fluid.profiler.cuda_profiler :noindex: +.. _api_fluid_profiler_reset_profiler: + reset_profiler -------------- .. autofunction:: paddle.fluid.profiler.reset_profiler :noindex: +.. _api_fluid_profiler_profiler: + profiler -------- .. autofunction:: paddle.fluid.profiler.profiler :noindex: +.. _api_fluid_profiler_start_profiler: + start_profiler -------------- .. autofunction:: paddle.fluid.profiler.start_profiler :noindex: +.. _api_fluid_profiler_stop_profiler: + stop_profiler ------------- diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst new file mode 100644 index 0000000000000000000000000000000000000000..f0c12fd115478a29fbd178b533b7490b2f663717 --- /dev/null +++ b/doc/fluid/api/recordio_writer.rst @@ -0,0 +1,23 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +===================== +fluid.recordio_writer +===================== + +.. _api_fluid_recordio_writer_convert_reader_to_recordio_file: + +convert_reader_to_recordio_file +------------------------------- + +.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file + :noindex: + +.. 
_api_fluid_recordio_writer_convert_reader_to_recordio_files: + +convert_reader_to_recordio_files +-------------------------------- + +.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files + :noindex: + diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst index 756bc53baa0625aef48dad0c35e7ae57421a70d0..987eaea903520d91c284c8da7a8cb066a1648069 100644 --- a/doc/fluid/api/regularizer.rst +++ b/doc/fluid/api/regularizer.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -regularizer -=========== +================= +fluid.regularizer +================= + +.. _api_fluid_regularizer_append_regularization_ops: append_regularization_ops ------------------------- @@ -11,12 +13,7 @@ append_regularization_ops .. autofunction:: paddle.fluid.regularizer.append_regularization_ops :noindex: -WeightDecayRegularizer ----------------------- - -.. autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer - :members: - :noindex: +.. _api_fluid_regularizer_L1Decay: L1Decay ------- @@ -25,6 +22,8 @@ L1Decay :members: :noindex: +.. _api_fluid_regularizer_L2Decay: + L2Decay ------- @@ -32,6 +31,8 @@ L2Decay :members: :noindex: +.. _api_fluid_regularizer_L1DecayRegularizer: + L1DecayRegularizer ------------------ @@ -39,6 +40,8 @@ L1DecayRegularizer :members: :noindex: +.. _api_fluid_regularizer_L2DecayRegularizer: + L2DecayRegularizer ------------------ diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst index b3535b449eb0e5ac6563256ddac3bf4a27fd8ce6..943d39331d26c05764c90cb24f6774997c976bfe 100644 --- a/doc/fluid/api/transpiler.rst +++ b/doc/fluid/api/transpiler.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -========== -transpiler -========== +================ +fluid.transpiler +================ + +.. _api_fluid_transpiler_DistributeTranspiler: DistributeTranspiler -------------------- @@ -12,12 +14,7 @@ DistributeTranspiler :members: :noindex: -InferenceTranspiler -------------------- - -.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler - :members: - :noindex: +.. _api_fluid_transpiler_memory_optimize: memory_optimize --------------- @@ -25,12 +22,16 @@ memory_optimize .. autofunction:: paddle.fluid.transpiler.memory_optimize :noindex: +.. _api_fluid_transpiler_release_memory: + release_memory -------------- .. autofunction:: paddle.fluid.transpiler.release_memory :noindex: +.. _api_fluid_transpiler_HashName: + HashName -------- @@ -38,9 +39,12 @@ HashName :members: :noindex: +.. _api_fluid_transpiler_RoundRobin: + RoundRobin ---------- .. autoclass:: paddle.fluid.transpiler.RoundRobin :members: :noindex: + diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md index d606d7a790b4b0dc18553f2220d39cec8aa619ec..748488f6d5f2f1272e87b89047570632418da8dc 100644 --- a/doc/fluid/design/concepts/lod_tensor.md +++ b/doc/fluid/design/concepts/lod_tensor.md @@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows: ## Slicing of LoD Tensors + When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch as the **-slice**. 
For example, the <2>-slice of above example is @@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is 10 12 || ``` + +## Length Representation vs Offset Representation + +The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult. +Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. +Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python: +```Python +# length representation of lod called recursive_sequence_lengths +recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]] +# Create a LoDTensor that has the above recursive_sequence_lengths info. +# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood. +tensor = fluid.LoDTensor(recursive_seq_lens) + +# Set/Change the recursive_sequence_lengths info of LoDTensor +tensor.set_recursive_sequence_lengths([[3, 1, 2]]) +# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted +# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]] +new_recursive_seq_lens = tensor.recursive_sequence_lengths() +``` diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md new file mode 100644 index 0000000000000000000000000000000000000000..dffee8e02bacbc99bdfa8c54f1a146de340ad778 --- /dev/null +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -0,0 +1,130 @@ +# Python Data Feeding + +In the former implementation of Paddle Fluid, there are two ways to feed data: + +- Use `reader_op` on the backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For example, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. + +- Feed data directly using `DataFeeder.feed()` in Python code. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other language, instead of adding many uncommon `operators` on the C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. + +In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ side, while `LoDTensorArray` is pushed into the queue on the Python side and `reader_op` on the C++ side reads out the data from the queue. + + +## Design of LoDTensorBlockingQueue +`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`.
+ +```C++ +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + private: + // `LoDTensorBlockingQueue` can only be constructed by + // `LoDTensorBlockingQueueHolder::InitOnce()` + LoDTensorBlockingQueue(size_t capacity, const std::vector& dims); + + public: + size_t Size() const { return queue_.Size(); } // Get the current size of the queue + + size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue + + void Close() { return queue_.Close(); } + + bool IsClosed() const { return queue_.IsClosed(); } + + // Block if Size() == Cap() + // Return false only when queue_.IsClosed() == true + bool Push(const std::vector &lod_tensor_vec); + + // Block if Size() == 0. + // *Success == false when queue_.IsClosed() == true + std::vector Pop(bool *success = nullptr); + + private: + // Use reader::BlockingQueue as the inner data structure + BlockingQueue> queue_; + std::vector dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + // Call the constructor of `LoDTensorBlockingQueue` to create queue_ + // `InitOnce` can only called once, otherwise an exception would raise + void InitOnce(size_t capacity, const std::vector& dims) { + PADDLE_ENFORCE(queue_ == nullptr); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + const std::shared_ptr& GetQueue() const { return queue_; } + + private: + std::shared_ptr queue_; +}; +``` + +There are some major things that must be concerned: +- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. +- A `Variable` of `LoDTensorBlockingQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. +- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input. + + +## Release of the GIL in pybind +`Pybind11::gil_scoped_release` is used to release GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` method are invoked in Python side, making `LoDTensorBlockingQueue::Push()` and `Executor::Run()` run in parallel. + + +## Design of PyReader +`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object. +```C++ +class PyReader : public ReaderBase { + public: + explicit PyReader(const std::shared_ptr& queue); + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + void ReInit() override { return; } + + private: + std::shared_ptr queue_; +}; +``` + + +## Design of CreatePyReaderOp +`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable. +```C++ +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE(queue_holder_var != nullptr); + auto* queue_holder = queue_holder_var + ->template GetMutable(); + out->Reset(new PyReader(queue_holder->GetQueue())); + } +}; +``` + +## Design of Python codes +The design of Python codes are as follows. 
First, we construct a variable of `LoDTensorBlockingQueueHolder` and init it with given parameters, returning the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and result of the layer are both returned. +```Python +def py_reader(capacity, shapes): + queue_name = unique_name.generate("lod_tensor_blocking_queue") + var = global_scope().var(feeder_name) # create LoDTensorBlockingQueueHolder Variable + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue + out = create_var() + create_py_reader_op_with_queue_name( + inputs={'blocking_queue': queue_name}, + outputs={'Out':[out]}) + return out, feed_queue +``` diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md index 967317d5d2eeb818ab14faabca342cc8c4ed717e..4d2aab87b8cf30d03075e96cc4c67070efaf963a 100644 --- a/doc/fluid/design/multi_devices/kernel_selection.md +++ b/doc/fluid/design/multi_devices/kernel_selection.md @@ -74,10 +74,10 @@ void OperatorWithKernel::Run( auto kernel_type_for_var = this->GetKernelTypeForVar(...); if (kernel_type_for_var.place_ != expected_kernel_key.place_) { auto* trans_var = new_scope.Var(var_name); - auto* out = DataTransform(expected_kernel_key, + auto* out = TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in); - CopyVariableWithTensor(...); + SetTensorToVariable(...); } } diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md index 9b55a66ded8b48f7105c05f1462839a72ab5f904..7fb0883dd937465d15479b29df95078edb50e069 100644 --- a/doc/fluid/howto/optimization/host_memory_profiling_cn.md +++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md @@ -1,4 +1,4 @@ -## 堆内存分析和优化 +# 堆内存分析和优化 计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行占用的内存越来越大,一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成oom,甚至会影响运行程序的机器的稳定性,造成宕机。 @@ -20,11 +20,11 @@ Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/P 对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。 -## 使用流程 -#### 环境 +## 环境 + 本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。 -#### 使用流程 +## 使用流程 - 安装google-perftools diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..5d061e1c00d2ca0194153730a39486b8357fa5b0 --- /dev/null +++ b/doc/fluid/howto/optimization/timeline_cn.md @@ -0,0 +1,26 @@ +# 如何使用timeline工具做性能分析 + +1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 + + **提示:** + 请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。 + + ```python + with profiler.profiler('All', 'total', '/tmp/profile') as prof: + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[]) + ... + ``` + +1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。 + +1. 打开chrome浏览器,访问,用`load`按钮来加载生成的`timeline`文件。 + + ![chrome tracing](./tracing.jpeg) + +1. 
结果如下图所示,可以放到来查看timetime的细节信息。 + + ![chrome timeline](./timeline.jpeg) diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline_en.md similarity index 100% rename from doc/fluid/howto/optimization/timeline.md rename to doc/fluid/howto/optimization/timeline_en.md diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 0f56d648b1939e1d6af3368bb2423477a3b638fc..a8bbb4eb8081420ae0bbaf761bd27303c0d043cb 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -19,6 +19,9 @@ endif(APPLE) set(inference_deps paddle_inference_api paddle_fluid_api) +if(WITH_GPU AND TENSORRT_FOUND) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) +endif() function(inference_api_test TARGET_NAME) if (WITH_TESTING) @@ -43,6 +46,10 @@ cc_library(paddle_inference_api SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +cc_library(paddle_inference_api_shared SHARED + SRCS paddle_inference_api.cc paddle_inference_api_impl.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + cc_test(test_paddle_inference_api SRCS test_paddle_inference_api.cc DEPS paddle_inference_api) @@ -50,17 +57,30 @@ cc_test(test_paddle_inference_api inference_api_test(test_paddle_inference_api_impl ARGS test_word2vec test_image_classification) -if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI +if(WITH_GPU AND TENSORRT_FOUND) +cc_library(paddle_inference_tensorrt_subgraph_engine + SRCS paddle_inference_api_tensorrt_subgraph_engine.cc + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api) + +inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec) +endif() + +if (WITH_ANAKIN) # only needed in CI # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to # compile the libinference_anakin_api.a and compile with anakin.so. 
- nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_link_libraries(inference_anakin_api anakin anakin_saber_common) - cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc + target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) + if (WITH_TESTING) + cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin DEPS inference_anakin_api) - target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endif(WITH_TESTING) endif() if(WITH_TESTING) diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt index 7b0fa77ad13c19f177e5b2446bcda6551471e45f..ecece6fe3471ad7b89c84c3e2b67af4ae9eb3c36 100644 --- a/paddle/contrib/inference/demo/CMakeLists.txt +++ b/paddle/contrib/inference/demo/CMakeLists.txt @@ -14,3 +14,48 @@ # inference_api_test(simple_on_word2vec ARGS test_word2vec) + +option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF) +if(NOT WITH_INFERENCE_DEMO) + return() +endif() + +set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo") +set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F) + +function(inference_download_test_demo TARGET) + if (NOT WITH_TESTING) + return() + endif() + set(options "") + set(oneValueArgs URL) + set(multiValueArgs SRCS) + cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}") + message(STATUS "inference demo ${test_dir}") + + if(NOT EXISTS "${test_dir}") + message(STATUS "Download ${TARGET} model from ${tests_URL}") + execute_process(COMMAND bash -c "mkdir -p ${test_dir}") + execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}") + execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz") + endif() + + cc_test(${TARGET} SRCS "${tests_SRCS}" + DEPS paddle_inference_api paddle_fluid + ARGS --data=${test_dir}/data.txt + --modeldir=${test_dir}/model + --refer=${test_dir}/result.txt) +endfunction() + +# disable mobilenet test +#inference_download_test_demo(mobilenet_inference_demo +# SRCS vis_demo.cc +# URL ${URL_ROOT}mobilenet.tar.gz) +inference_download_test_demo(se_resnext50_inference_demo + SRCS vis_demo.cc + URL ${URL_ROOT}se_resnext50.tar.gz) +inference_download_test_demo(ocr_inference_demo + SRCS vis_demo.cc + URL ${URL_ROOT}ocr.tar.gz) diff --git a/paddle/contrib/inference/demo/README.md b/paddle/contrib/inference/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f1d256660299a68dc5d9d73dbe4a401a0e7d9680 --- /dev/null +++ b/paddle/contrib/inference/demo/README.md @@ -0,0 +1,36 @@ +# Infernce Demos + +Input data format: + +- Each line contains a single record +- Each record's format is + +``` +\t +``` + +Follow the C++ codes in `vis_demo.cc`. 
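+
+A minimal sketch (illustration only; the field layout is inferred from `ProcessALine` in `vis_demo.cc`: space-separated float data, a tab, then space-separated integer shape; the numbers below are made up) of writing one record from Python:
+
+```python
+# One record per line: "<space-separated floats>\t<space-separated ints>"
+data = [0.1, 0.2, 0.3, 0.4]   # flattened tensor values (made-up)
+shape = [1, 1, 2, 2]          # tensor shape (made-up)
+line = " ".join(str(v) for v in data) + "\t" + " ".join(str(s) for s in shape)
+with open("data.txt", "w") as f:
+    f.write(line + "\n")
+```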
+ +## MobileNet + +To execute the demo, simply run + +```sh +./mobilenet_inference_demo --modeldir --data +``` + +## SE-ResNeXt-50 + +To execute the demo, simply run + +```sh +./se_resnext50_inference_demo --modeldir --data +``` + +## OCR + +To execute the demo, simply run + +```sh +./ocr_inference_demo --modeldir --data +``` diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 2a4bfc87069b9fd8ece58dde210a6cb8344da536..c253014642f39a042430992548a285cc7078a959 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include #include #include "paddle/contrib/inference/paddle_inference_api.h" + namespace paddle { namespace demo { diff --git a/paddle/contrib/inference/demo/utils.h b/paddle/contrib/inference/demo/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b5330d8d9d89260cfe3d5214e5a4ceb720cffdf1 --- /dev/null +++ b/paddle/contrib/inference/demo/utils.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { +namespace demo { + +static void split(const std::string& str, + char sep, + std::vector* pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} + +/* + * Get a summary of a PaddleTensor content. + */ +static std::string SummaryTensor(const PaddleTensor& tensor) { + std::stringstream ss; + int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype); + + ss << "data[:10]\t"; + switch (tensor.dtype) { + case PaddleDType::INT64: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + case PaddleDType::FLOAT32: + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + return ss.str(); +} + +} // namespace demo +} // namespace paddle diff --git a/paddle/contrib/inference/demo/vis_demo.cc b/paddle/contrib/inference/demo/vis_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..45575f9a862de430236ae20cf498e542a45b1f4b --- /dev/null +++ b/paddle/contrib/inference/demo/vis_demo.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo for mobilenet, se-resnext50 and ocr. + */ + +#include +#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. +#include +#include +#include +#include "paddle/contrib/inference/demo/utils.h" +#include "paddle/contrib/inference/paddle_inference_api.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +#endif + +namespace paddle { +namespace demo { + +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, + "", + "path of data; each line is a record, format is " + "'\t data; + std::vector shape; +}; + +void split(const std::string& str, char sep, std::vector* pieces); + +Record ProcessALine(const std::string& line) { + LOG(INFO) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + LOG(INFO) << "data size " << record.data.size(); + LOG(INFO) << "data shape size " << record.shape.size(); + return record; +} + +void CheckOutput(const std::string& referfile, const PaddleTensor& output) { + std::string line; + std::ifstream file(referfile); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + LOG(INFO) << "predictor output numel " << numel; + LOG(INFO) << "reference output numel " << refer.data.size(); + EXPECT_EQ(numel, refer.data.size()); + switch (output.dtype) { + case PaddleDType::INT64: { + for (size_t i = 0; i < numel; ++i) { + EXPECT_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + case PaddleDType::FLOAT32: + for (size_t i = 0; i < numel; ++i) { + EXPECT_NEAR( + static_cast(output.data.data())[i], refer.data[i], 1e-5); + } + break; + } +} + +/* + * Use the native fluid engine to inference the demo. + */ +void Main(bool use_gpu) { + NativeConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = use_gpu; + config.device = 0; +#ifdef PADDLE_WITH_CUDA + config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use; +#endif + + LOG(INFO) << "init predictor"; + auto predictor = + CreatePaddlePredictor(config); + + LOG(INFO) << "begin to process data"; + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. 
+ PaddleTensor input{ + .name = "xx", + .shape = record.shape, + .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), + .dtype = PaddleDType::FLOAT32}; + + LOG(INFO) << "run executor"; + std::vector output; + predictor->Run({input}, &output); + + LOG(INFO) << "output.size " << output.size(); + auto& tensor = output.front(); + LOG(INFO) << "output: " << SummaryTensor(tensor); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); +} + +TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); } +#ifdef PADDLE_WITH_CUDA +TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); } +#endif +} // namespace demo +} // namespace paddle diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/contrib/inference/high_level_api.md index 563b696143de9cbf67db38048bbd2f7c11b3a66e..eb92885052a453d8c837bbf6f6e984efb509332a 100644 --- a/paddle/contrib/inference/high_level_api.md +++ b/paddle/contrib/inference/high_level_api.md @@ -1,10 +1,10 @@ # Inference High-level APIs -This document describes the high-level inference APIs one can use to easily deploy a Paddle model for an application. +This document describes the high-level inference APIs, one can use them to deploy a Paddle model for an application quickly. -The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed. +The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed for a deployment. ## PaddleTensor -We provide the `PaddleTensor` data structure is to give a general tensor interface. +We provide the `PaddleTensor` data structure to give a general tensor interface. The definition is @@ -17,18 +17,19 @@ struct PaddleTensor { }; ``` -The data is stored in a continuous memory `PaddleBuf`, and tensor's data type is specified by a `PaddleDType`. -The `name` field is used to specify the name of input variable, -that is important when there are multiple inputs and need to distiuish which variable to set. +The data is stored in a continuous memory `PaddleBuf,` and a `PaddleDType` specifies tensor's data type. +The `name` field is used to specify the name of an input variable, +that is important when there are multiple inputs and need to distinguish which variable to set. ## engine -The inference APIs has two different underlying implementation, currently there are two valid engines: +The inference APIs has two different underlying engines - the native engine, which is consists of the native operators and framework, -- the Anakin engine, which is a Anakin library embeded. +- the Anakin engine, which has an Anakin library embedded. The native engine takes a native Paddle model as input, and supports any model that trained by Paddle, -but the Anakin engine can only take the Anakin model as input(user need to manully transform the format first) and currently not all Paddle models are supported. +the Anakin engine is faster for some model, +but it can only take the Anakin model as input(user need to transform the format first manually) and currently not all Paddle models are supported. 
```c++ enum class PaddleEngineKind { @@ -38,10 +39,10 @@ enum class PaddleEngineKind { ``` ## PaddlePredictor and how to create one -The main interface is `PaddlePredictor`, there are following methods +The main interface is `PaddlePredictor,` there are following methods - `bool Run(const std::vector& inputs, std::vector* output_data)` - - take inputs and output `output_data` + - take inputs and output `output_data.` - `Clone` to clone a predictor from an existing one, with model parameter shared. There is a factory method to help create a predictor, and the user takes the ownership of this object. @@ -51,9 +52,9 @@ template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); ``` -By specifying the engine kind and config, one can get an specific implementation. +By specifying the engine kind and config, one can get a specific implementation. ## Reference - [paddle_inference_api.h](./paddle_inference_api.h) -- [demos](./demo) +- [some demos](./demo) diff --git a/paddle/contrib/inference/high_level_api_cn.md b/paddle/contrib/inference/high_level_api_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..a57f015a4e44d43ee4e475cf606faa6f05e095fa --- /dev/null +++ b/paddle/contrib/inference/high_level_api_cn.md @@ -0,0 +1,87 @@ +# Paddle 预测 API + +为了更简单方便的预测部署,Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。 + +预测库包含: + +- 头文件 `paddle_inference_api.h` 定义了所有的接口 +- 库文件`libpaddle_fluid.so` 或 `libpaddle_fluid.a` +- 库文件 `libpaddle_inference_api.so` 或 `libpaddle_inference_api.a` + +下面是详细的一些 API 概念介绍 + +## PaddleTensor + +PaddleTensor 定义了预测最基本的输入输出的数据格式,其定义是 + +```c++ +struct PaddleTensor { + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; +``` + +- `name` 用于指定输入数据对应的 模型中variable 的名字 (暂时没有用,但会在后续支持任意 target 时启用) +- `shape` 表示一个 Tensor 的 shape +- `data` 数据以连续内存的方式存储在`PaddleBuf` 中,`PaddleBuf` 可以接收外面的数据或者独立`malloc`内存,详细可以参考头文件中相关定义。 +- `dtype` 表示 Tensor 的数据类型 + +## engine + +高层 API 底层有多种优化实现,我们称之为 engine,目前有三种 engine + +- 原生 engine,由 paddle 原生的 forward operator 组成,可以天然支持所有paddle 训练出的模型, +- Anakin engine,封装了 [Anakin](https://github.com/PaddlePaddle/Anakin) ,在某些模型上性能不错,但只能接受自带模型格式,无法支持所有 paddle 模型, +- TensorRT mixed engine,用子图的方式支持了 [TensorRT](https://developer.nvidia.com/tensorrt) ,支持所有paddle 模型,并自动切割部分计算子图到 TensorRT 上加速(WIP) + +其实现为 + +```c++ +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. + kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops. +}; +``` + +## 预测部署过程 + +总体上分为以下步骤 + +1. 用合适的配置创建 `PaddlePredictor` +2. 创建输入用的 `PaddleTensor`,传入到 `PaddlePredictor` 中 +3. 获取输出的 `PaddleTensor` ,将结果取出 + +下面完整演示一个简单的模型,部分细节代码隐去 + +```c++ +#include "paddle_inference_api.h" + +// 创建一个 config,并修改相关设置 +paddle::NativeConfig config; +config.model_dir = "xxx"; +config.use_gpu = false; +// 创建一个原生的 PaddlePredictor +auto predictor = + paddle::CreatePaddlePredictor(config); +// 创建输入 tensor +int64_t data[4] = {1, 2, 3, 4}; +paddle::PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::INT64}; +// 创建输出 tensor,输出 tensor 的内存可以复用 +std::vector outputs; +// 执行预测 +CHECK(predictor->Run(slots, &outputs)); +// 获取 outputs ... 
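+// 示意:取出第一个输出并按 float 访问(假设输出类型为 FLOAT32,仅作参考,非固定写法)
+auto& out_tensor = outputs.front();
+float* out_data = static_cast<float*>(out_tensor.data.data());
+size_t out_num = out_tensor.data.length() / sizeof(float);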
+``` + +编译时,联编 `libpaddle_fluid.a/.so` 和 `libpaddle_inference_api.a/.so` 便可。 + +## 详细代码参考 + +- [inference demos](./demo) +- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc) diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc index dc2842ae0eeb5592b6d4571b70df162886aee7a2..ea46b3006f8d0964cc8229d3683ee7b602d6ef0d 100644 --- a/paddle/contrib/inference/paddle_inference_api.cc +++ b/paddle/contrib/inference/paddle_inference_api.cc @@ -16,6 +16,19 @@ limitations under the License. */ namespace paddle { +int PaddleDtypeSize(PaddleDType dtype) { + switch (dtype) { + case PaddleDType::FLOAT32: + return sizeof(float); + case PaddleDType::INT64: + return sizeof(int64_t); + default: + // + assert(false); + return -1; + } +} + PaddleBuf::PaddleBuf(PaddleBuf&& other) : data_(other.data_), length_(other.length_), @@ -62,4 +75,4 @@ void PaddleBuf::Free() { } } -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index 38e3cc21413b9ab715b84f278f00b9df23cb7682..b8ba2d14a5c161d491d838888ea14b776f769f23 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -15,7 +15,7 @@ limitations under the License. */ /* * This file contains the definition of a simple Inference API for Paddle. * - * ATTENTION: It requires some C++ features, for lower version C++ or C, we + * ATTENTION: It requires some C++11 features, for lower version C++ or C, we * might release another API. */ @@ -73,12 +73,12 @@ struct PaddleTensor { }; enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAnakin, // Use Anakin for inference. + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. + kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. // TODO(Superjomn) support following engines latter. // kTensorRT, // Use TensorRT for inference. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. - // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. }; /* @@ -130,6 +130,11 @@ struct AnakinConfig : public PaddlePredictor::Config { int max_batch_size{-1}; }; +struct TensorRTConfig : public NativeConfig { + // Determine whether a subgraph will be executed by TRT. + int min_subgraph_size{1}; +}; + // A factory to help create different predictors. // // FOR EXTENSION DEVELOPER: @@ -140,4 +145,7 @@ struct AnakinConfig : public PaddlePredictor::Config { // Similarly, each engine kind should map to a unique predictor implementation. template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +int PaddleDtypeSize(PaddleDType dtype); + } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index d9129a704bc289ce1d416474537fc9234a07e5b8..b1e5b875981e0142f6970cf6864b7b598743654b 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -89,6 +89,7 @@ bool NativePaddlePredictor::Init( LOG(ERROR) << "fail to load inference model."; return false; } + ctx_ = executor_->Prepare(*inference_program_, 0); executor_->CreateVariables( *inference_program_, sub_scope_ ? 
sub_scope_ : scope_.get(), 0); @@ -119,6 +120,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, return false; } for (size_t i = 0; i < feed_target_names_.size(); ++i) { + VLOG(4) << "setting " << i << "-th target"; feed_targets[feed_target_names_[i]] = &feeds[i]; } // get fetch variable @@ -130,14 +132,16 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, } // Run the inference program // if share variables, we need not create variables + VLOG(4) << "Run prepared context"; executor_->RunPreparedContext( ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(), &feed_targets, &fetch_targets, false /* don't create variable eatch time */); + VLOG(4) << "Finish prepared context"; if (!GetFetch(fetchs, output_data)) { - LOG(ERROR) << "fail to get fetchs"; + LOG(ERROR) << "fail to get fetches"; return false; } VLOG(3) << "predict cost: " << timer.toc() << "ms"; diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h index 86d1db7bcc7567e104cd20c9f767ed4513f611f5..ba266b608da342fb71faf05d02ddf74330e21e98 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.h +++ b/paddle/contrib/inference/paddle_inference_api_impl.h @@ -44,7 +44,7 @@ class NativePaddlePredictor : public PaddlePredictor { ~NativePaddlePredictor() override; - private: + protected: bool SetFeed(const std::vector &input_datas, std::vector *feeds); bool GetFetch(const std::vector &fetchs, diff --git a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..a11396cee91a758e86af2efd9e58b9da68442590 --- /dev/null +++ b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/contrib/inference/paddle_inference_api_impl.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { + +using inference::analysis::Argument; +using inference::Singleton; +using inference::analysis::Analyzer; +using framework::proto::ProgramDesc; + +class TensorRTSubgraphPredictor : public NativePaddlePredictor { + public: + explicit TensorRTSubgraphPredictor(const TensorRTConfig& config) + : NativePaddlePredictor(config), config_(config) {} + + bool Init(const std::shared_ptr& parent_scope) { + VLOG(3) << "Predictor::init()"; + + if (config_.use_gpu) { + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + + executor_.reset(new paddle::framework::Executor(place_)); + + // Initialize the inference program + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << "fail to load inference model."; + return false; + } + + // Analyze inference_program + Argument argument; + argument.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + Singleton::Global().Run(&argument); + CHECK(argument.transformed_program_desc); + VLOG(5) << "transformed program:\n" + << argument.transformed_program_desc->SerializeAsString(); + VLOG(5) << "to prepare executor"; + *inference_program_->Proto() = *argument.transformed_program_desc; + ctx_ = executor_->Prepare(*inference_program_, 0); + + VLOG(5) << "to create variables"; + executor_->CreateVariables( + *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); + + // Get the feed_target_names and fetch_target_names + feed_target_names_ = inference_program_->GetFeedTargetNames(); + fetch_target_names_ = inference_program_->GetFetchTargetNames(); + return true; + } + + private: + TensorRTConfig config_; +}; + +template <> +std::unique_ptr +CreatePaddlePredictor( + const TensorRTConfig& config) { + VLOG(3) << "create TensorRTSubgraphPredictor"; + if (config.use_gpu) { + // 1. 
GPU memeroy + PADDLE_ENFORCE_GT( + config.fraction_of_gpu_memory, + 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); + PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + std::vector flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + std::to_string(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor( + new TensorRTSubgraphPredictor(config)); + if (!dynamic_cast(predictor.get()) + ->Init(nullptr)) { + return nullptr; + } + return std::move(predictor); +} + +} // namespace paddle diff --git a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..b100630dbe412ca811f1a8f2b8191356f5ebec2f --- /dev/null +++ b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { + +DEFINE_string(dirname, "", "Directory of the inference model."); + +void Main(bool use_gpu) { + //# 1. Create PaddlePredictor with a config. + TensorRTConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto predictor = + CreatePaddlePredictor(config); + + for (int batch_id = 0; batch_id < 3; batch_id++) { + //# 2. Prepare input. + int64_t data[4] = {1, 2, 3, 4}; + + PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::INT64}; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + + //# 3. Run + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + //# 4. Get output. + ASSERT_EQ(outputs.size(), 1UL); + LOG(INFO) << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. 
+ for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast(outputs.front().data.data())[i]; + } + } +} + +TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); } + +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 5b8dfc57ba020cea259041f55a66472ea26b4eec..cd00b7de7338982308acfa1f1e8c38e010c6a43b 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -147,10 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: ", in.type().name()); memory::data_type out_type = in_type; - memory::format in_format = - in_tz.size() == 2 ? memory::format::nc : in.format(); - memory::format out_format = - out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout); + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); + auto out_format = + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); void* in_data = GetDataFromTensor(in, in_type); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 2ba84ce57fd8aa3d9aa651bdaa2930e459c74e88..90bb206ec6b698bc23ad1a5c9609a25186ec6de8 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -61,6 +61,7 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { if (iter != dict.end()) return iter->second; return MKLDNNDataType::data_undef; } + #endif void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index b8fcc92697ca1bf1d971f8fef020f31d405605a9..82872224501709080ff02a13464d58543a0abda8 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -18,17 +18,21 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_type_transform.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace framework { -static void PassTensorData(Tensor* from, Tensor* to) { +static void PassTensorData(Tensor *from, Tensor *to) { to->ShareDataWith(*from); *from = Tensor(); } -void DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor, Tensor* output_tensor) { +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *output_tensor) { bool transformed = false; Tensor in; in.ShareDataWith(input_tensor); @@ -47,9 +51,13 @@ void DataTransform(const OpKernelType& expected_kernel_type, #ifdef PADDLE_WITH_MKLDNN // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. 
No real transform occur + + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); + out.ShareDataWith(input_tensor); out.set_layout(DataLayout::kMKLDNN); - out.set_format(ToMKLDNNFormat(lin)); + out.set_format(out_format); #endif } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel @@ -85,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type, output_tensor->ShareDataWith(in); } -void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable* out_var) { +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var) { if (in_var.IsType()) { - auto& in_lod_tensor = in_var.Get(); - auto* tran_lod_tensor = out_var->GetMutable(); + auto &in_lod_tensor = in_var.Get(); + auto *tran_lod_tensor = out_var->GetMutable(); tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { - auto& in_selected_rows = in_var.Get(); - auto* trans_selected_rows = out_var->GetMutable(); + auto &in_selected_rows = in_var.Get(); + auto *trans_selected_rows = out_var->GetMutable(); trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index dee5d8c7c1126013742460df1d94bb364220ad09..ae3ab051bda2e698801cc6fe6e3ddddf039f5385 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -30,12 +30,15 @@ limitations under the License. */ namespace paddle { namespace framework { -void DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor, Tensor* out); - -void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable* out_var); +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *out); + +/** + * Set OutVar from InVar, except the tensor is shared with `tensor` + */ +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index a6fe64fa80d6bf036893d49de56d7274d49a3b30..cc7b94d0653e34c8ac711a7db7ab6ab1a9ac46a2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -207,53 +207,56 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( is_forwarding = false; } else { int op_dev_id = GetOpDeviceID(*op); - if (op_dev_id == -1) { // var on all device - CreateComputationalOps(&result, *op, places_.size()); - } else { + if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, *op, op_dev_id); for (auto &var_name : op->OutputArgumentNames()) { var_name_on_devices_.emplace(var_name, op_dev_id); } - } - if (!is_forwarding && places_.size() > 1) { - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. 
- if (static_cast(boost::get(op->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = - boost::get>(op->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices_.emplace(g_name, cur_device_id); - bcast_var_name_set[cur_device_id].emplace(p_name); - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; + } else { + // This op runs on all devices, and its output may have parameter's + // gradients. + CreateComputationalOps(&result, *op, places_.size()); + + if (!is_forwarding && places_.size() > 1) { + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + if (static_cast(boost::get(op->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward))) { + try { + auto backward_vars = + boost::get>(op->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + var_name_on_devices_.emplace(g_name, cur_device_id); + bcast_var_name_set[cur_device_id].emplace(p_name); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; + } } + } catch (boost::bad_get e) { } - } catch (boost::bad_get e) { } } } @@ -470,7 +473,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const { int op_dev_id = -1; - if (op.Type() == "split_byref") { + if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") { op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); @@ -483,6 +486,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, } } else if (op.Type() == "concat") { op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } } else { PADDLE_ENFORCE( "the distribute training related op should be in [split_byref, " diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h 
b/paddle/fluid/framework/details/ssa_graph_builder.h index 9eb23c46264f9036f009b0ae9aeeb34ec70c0e53..18612c3c1b62cf4c2ebdc221c301c59ec81c2da7 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -30,7 +30,7 @@ class SSAGraphBuilder { SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; - virtual int GetVarDeviceID(const std::string &var_name) const { return -1; } + virtual int GetVarDeviceID(const std::string &var_name) const = 0; DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index 304b221e7e4c414a0ab562a1b99836d3b7c02efb..331aa9d2b5864c470dbd5e29ef6faccffdcf781c 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -16,6 +16,8 @@ #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include + namespace paddle { namespace framework { namespace details { @@ -33,6 +35,10 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder { return graph; } + int GetVarDeviceID(const std::string& var_name) const override { + return builder_->GetVarDeviceID(var_name); + } + bool IsValidGraph(const SSAGraph* graph) const; private: diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index b4c90013789759d17646d95efdc81fc6a0a4f3e7..09b0333ef2cb43a306133aa5af98d37c11454d4d 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include "paddle/fluid/framework/details/ssa_graph_builder.h" namespace paddle { @@ -55,6 +56,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { return graph; } + int GetVarDeviceID(const std::string& var_name) const override { + return builder_->GetVarDeviceID(var_name); + } + private: std::unique_ptr printer_; std::unique_ptr builder_; diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index d29d8ce1c561e45980d10c17c984ca2ed3b453f3..5373d769a4993bb378b30c3b23885c072b778e5c 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -68,7 +68,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? 
t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - if (t.type().hash_code() == typeid(float).hash_code()) { + if (t.type().hash_code() == typeid(float).hash_code()) { // NOLINT os << t.data()[i] << " "; } else if (t.type().hash_code() == typeid(int64_t).hash_code()) { os << t.data()[i] << " "; diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index f51a184e7bae2283f335fe9462a77b9c5fb831a5..c59b232191c49ccb47bb9f51dcaf2fd9280fae19 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { return ret; } -inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { +inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) { return (!platform::places_are_same_class(l.place_, r.place_)) || (l.data_type_ != r.data_type_) || NeedTransformLayout(l.data_layout_, r.data_layout_); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 122ee1dab35b8c7d42392a983b5b15b7c1be7869..4586183d8d206d07f8dcdc000a5ce0bc65d847d5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -620,8 +620,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, "There are no kernels which are registered in the %s operator.", type_); } - ExecutionContext ctx(*this, scope, *dev_ctx); - OpKernelMap& kernels = kernels_iter->second; // TODO(dzhwinter) : kernel fallback mechanism will be added when all the @@ -631,7 +629,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // Do selection // } - auto expected_kernel_key = this->GetExpectedKernelType(ctx); + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -640,56 +639,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope, KernelTypeToString(expected_kernel_key)); } - // do data transform - Scope& new_scope = scope.NewScope(); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + auto* transfer_scope = + TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - std::vector inplace_vars; - for (auto& var_name_item : this->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope.FindVar(var_name); - if (var && VarIsTensor(var)) { - auto* tensor_in = GetTensorFromVar(var); - if (tensor_in->IsInitialized()) { - auto kernel_type_for_var = this->GetKernelTypeForVar( - var_name_item.first, *tensor_in, expected_kernel_key); - if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { - auto out_var_names = OutputVars(true); - if (std::find(out_var_names.begin(), out_var_names.end(), - var_name) != out_var_names.end()) { - inplace_vars.push_back(var_name); - } - VLOG(3) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; - auto* trans_var = new_scope.Var(var_name); - std::shared_ptr out(new Tensor); - DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, - out.get()); - CopyVariableWithTensor(*var, *(out.get()), trans_var); - } - } - } - } + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = + (transfer_scope == nullptr ? 
scope : *transfer_scope); + + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); } - auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); - kernel_iter->second->Compute( - ExecutionContext(*this, new_scope, *new_dev_ctx)); + kernel_iter->second->Compute(ExecutionContext(*this, exec_scope, *dev_ctx)); - for (auto& var_name : inplace_vars) { - VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; - auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); - auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name)); - original_tensor->ShareDataWith(*transformed_tensor); + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. + TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); } /*For profiling/benchmark only*/ if (FLAGS_benchmark) { - new_dev_ctx->Wait(); + dev_ctx->Wait(); } if (FLAGS_check_nan_inf) { for (auto& vname : OutputVars(true)) { - auto* var = new_scope.FindVar(vname); + auto* var = exec_scope.FindVar(vname); if (var == nullptr) continue; if (var->IsType()) { CheckTensorNANOrInf(vname, var->Get()); @@ -697,6 +674,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } +void OperatorWithKernel::TransferInplaceVarsBack( + const Scope& scope, const std::vector& inplace_vars, + const Scope& transfer_scope) const { + for (auto& var_name : inplace_vars) { + VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; + auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); + auto* transformed_tensor = + GetTensorFromVar(transfer_scope.FindVar(var_name)); + original_tensor->ShareDataWith(*transformed_tensor); + } +} + +Scope* OperatorWithKernel::TryTransferData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars) const { + Scope* new_scope = nullptr; + for (auto& var_name_item : Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + // Only tensor can be tranfer to another device. + if (var == nullptr || !VarIsTensor(var)) { + continue; + } + + auto* tensor_in = GetTensorFromVar(var); + if (!tensor_in->IsInitialized()) { + continue; + } + + auto kernel_type_for_var = GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + + if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { + continue; + } + + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), var_name) != + out_var_names.end()) { + transfered_inplace_vars->emplace_back(var_name); + } + + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + + if (new_scope == nullptr) { + new_scope = &scope.NewScope(); + } + + auto* trans_var = new_scope->Var(var_name); + Tensor out; + TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); + SetTensorToVariable(*var, out, trans_var); + } + } + + return new_scope; +} proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39..1550d5df172f0599e1b42e7f1ccf51ac4dd1e0c3 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase { // same. 
proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; + + /** + * Transfer data from scope to a transfered scope. If there is no data need to + * be tranfered, it returns nullptr. + * + * * transfered_inplace_vars is a output vector. + */ + Scope* TryTransferData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars) const; + + void TransferInplaceVarsBack(const Scope& scope, + const std::vector& inplace_vars, + const Scope& exec_scope) const; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a6788cb6d5d6acb57998fb9b06dfaaf417912dde..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -133,17 +133,18 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToGPUs( const std::unordered_set &vars) const { - // the the initialize bcast, all vars would be bcast from device(0), otherwise + // the the initializing bcast, all vars would be bcast from device(0), + // otherwise // bcast from the specified device. - bool initialize = builder_.get() == nullptr ? true : false; + bool initializing = builder_.get() == nullptr ? true : false; for (auto &var : vars) { int var_dev_id = builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); - if (!initialize && var_dev_id == -1) continue; + if (!initializing && var_dev_id == -1) continue; framework::Variable *main_var = nullptr; - if (initialize) { + if (initializing) { main_var = member_->local_scopes_[0]->FindVar(var); } else { main_var = member_->local_scopes_[var_dev_id]->FindVar(var); @@ -164,7 +165,8 @@ void ParallelExecutor::BCastParamsToGPUs( auto place = member_->places_[i]; void *buffer; - if ((initialize && i == 0) || (!initialize && i == var_dev_id)) { + if ((initializing && i == 0) || + (!initializing && static_cast(i) == var_dev_id)) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; @@ -181,8 +183,16 @@ void ParallelExecutor::BCastParamsToGPUs( platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); - platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); + if (initializing) { + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } else { + if (var_dev_id >= 0) { + platform::dynload::ncclBcast(buffers[i], numel, data_type, + var_dev_id, nccl_ctx.comm_, + nccl_ctx.stream()); + } + } } member_->nccl_ctxs_->WaitAll(); } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e5bc74755f46449296a153e8b330968e6d9f1e1d..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); + if (platform::is_same_place(src_place, dst_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + if (platform::is_same_place(ctx_place, 
src_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + } else if (platform::is_same_place(ctx_place, dst_place)) { + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + } + } } #endif } @@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(src.place())) { - dev_ctx = pool.Get(src.place()); - } else { + if (platform::is_gpu_place(dst_place)) { dev_ctx = pool.Get(dst_place); + } else { + dev_ctx = pool.Get(src.place()); } TensorCopy(src, dst_place, *dev_ctx, dst); } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index dca279b69382b80e055f661cefe84b81326704b5..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -23,10 +23,25 @@ limitations under the License. */ namespace paddle { namespace framework { +// NOTE(zcd): Because TensorCopy is an async operation, when the src_place +// and dst_place are two different GPU, to ensure that the operation can +// be carried out correctly, there is a src_ctx wait operation in TensorCopy. +// If ctx_place and src_place are the same, src_ctx.Wait() is added +// after memory::Copy; if ctx_place and dst_place are the same, +// src_ctx.Wait() is added before memory::Copy. void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); + +// NOTE(zcd): If the src.place() and dst_place are two different GPU, +// the copy operation is carried out on the dst_place's stream. This is +// very important, because TensorCopy is an async operator, and in most +// case, once this copy operator returns, dst is to be used in dst_place's +// stream, if this copy operation is carried out on the src_place's stream, +// when dst is used in dst_place's stream the copy operation may be +// not completed. 
void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); + void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index ec16a1c600a3bafc1c4cbbd920360253c106e3a1..7071eea19c355c04711a11c224985be96c6589f4 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -28,9 +28,10 @@ endif() if(WITH_TESTING) # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) - add_subdirectory(analysis) endif() +add_subdirectory(analysis) + if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 2bb2c8135d8c317388e1a0d711589a390c7e8924..cdd67fdc929851979fe0a38afe1af74ec7321b8a 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,26 +1,30 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc data_flow_graph_to_fluid_pass.cc - tensorrt_subgraph_pass.cc dfg_graphviz_draw_pass.cc - DEPS framework_proto) + tensorrt_subgraph_pass.cc + tensorrt_subgraph_node_mark_pass.cc + analyzer.cc + helper.cc + DEPS framework_proto proto_desc) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) function (inference_analysis_test TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_test(${TARGET} - SRCS "${analysis_test_SRCS}" - DEPS analysis - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) - set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) + cc_test(${TARGET} + SRCS "${analysis_test_SRCS}" + DEPS analysis + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) + set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) + endif(WITH_TESTING) endfunction(inference_analysis_test) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) @@ -28,5 +32,7 @@ inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_ inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) -#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) +inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) +inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) diff --git 
a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4c5de189cd1eab1ba3de0b2cdfd2294d139ceab2 --- /dev/null +++ b/paddle/fluid/inference/analysis/README.md @@ -0,0 +1,57 @@ +# Inference Analysis + +The `inference/analysis` module is used to analyze and optimize the inference program, +it references some philosophy from `LLVM/analysis`, +and make the various optimization features be pluggable and co-exist in a pipeline. + +We borrowed some concepts from LLVM, such as + +- [Pass](./pass.h)es to implement optimization that traverse the inference program, +- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program, +- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph. + +There are some other basic concepts here + +- [Node](./node.h), the node in a `DataFlowGraph`, + - `Function`, the Operator in Fluid, + - `Value`, the Variable in Fluid; +- [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline, + +## How it works + +The `inference/analysis` module make all the passes in a pipeline, and works in such way: + +1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc, +2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes, +3. Transform a new ProgramDesc from the modified `DataFlowGraph`. + +The new optimization features can be added as an independent `Pass` and controlled by gflags, +each pass will generate unified debug information or visualization for better debugging. + +## Supported Passes + +### `FluidToDataFlowGraphPass` +Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes, +this should be the first pass of the pipeline. + +### `DataFlowGraphToFluidPass` +Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline. + +### `TensorRTSubgraphNodeMarkPass` +Mark the `Node` that are supported by TensorRT, +this pass will generate a visualization file which can be used for debugging. + +### `TensorRTSubGraphPass` +Split the sub-graph that are can be accelerated by TensorRT. + +### `DFG_GraphvizDrawPass` +This pass is just for debug, it will visualize the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool. + +It can be used as a helper class that draws the modified graph after each pass. + +## Utilities + +There is some helper function/class for analysis. + +- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes, +- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms, it uses `iterator` to make the algorithms easy to share across different passes. diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4625f008c15300b88ef0bce71cd7d8aa473c9a8 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, + "Enable subgraph to TensorRT engine for acceleration"); + +DEFINE_string(inference_analysis_graphviz_log_root, "./", + "Graphviz debuger for data flow graphs."); + +class DfgPassManagerImpl final : public DfgPassManager { + public: + DfgPassManagerImpl() { + // TODO(Superjomn) set the key with pass reprs. + AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); + if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { + auto trt_teller = [](const Node* node) { + if (!node->IsFunction()) return false; + return static_cast(node)->func_type() == "mul"; + }; + AddPass("tensorrt-subgraph-marker", + new TensorRTSubgraphNodeMarkPass(trt_teller)); + AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); + } + AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + } + + std::string repr() const override { return "dfg-pass-manager"; } + std::string description() const override { return "DFG pass manager."; } + + private: + void AddPass(const std::string& name, Pass* pass) { + LOG(INFO) << "Adding pass " << name; + Register(name, pass); + AddGraphvizDebugerPass(pass); + } + + // Add the graphviz debuger pass if the parent pass has one. + void AddGraphvizDebugerPass(Pass* pass) { + auto* debuger_pass = pass->CreateGraphvizDebugerPass(); + if (debuger_pass) { + LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]"; + Register(debuger_pass->repr(), debuger_pass); + } + } +}; + +Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } + +void Analyzer::Run(Argument* argument) { + for (auto& x : data_) { + PADDLE_ENFORCE(x->Initialize(argument)); + x->RunAll(); + PADDLE_ENFORCE(x->Finalize()); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h new file mode 100644 index 0000000000000000000000000000000000000000..e9e14fb1947da059c8d126d3da182ce446f6421e --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +/* + * This file contains Analyzer, an class that exposed as a library that analyze + * and optimize + * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to + * control whether + * an process is applied on the program. + * + * The processes are called Passes in analysis, the Passes are placed in a + * pipeline, the first + * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to + * a data flow + * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow + * graph to a + * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes + * which take a + * node or data flow graph as input. + * + * The Analyzer can be used in two methods, the first is a executable file which + * can be used to + * pre-process the inference model and can be controlled by passing difference + * command flags; + * the other way is to compose inside the inference API as a runtime pre-process + * phase in the + * inference service. + */ + +#include +#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" + +namespace paddle { +namespace inference { +namespace analysis { + +// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this +// flag if not available. +DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); +DECLARE_string(inference_analysis_graphviz_log_root); + +class Analyzer : public OrderedRegistry { + public: + // Register all the pass-managers. + Analyzer(); + + void Run(Argument* argument); + + DISABLE_COPY_AND_ASSIGN(Analyzer); +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7c1a72932a39f878add2bb884e280b91d3c38c0 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, main) { + Analyzer analyser; + analyser.Run(&argument); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index f7f4e03968a723df1718bd3752bdd1c3430d02be..6d316f20bff7a68754b0afec6463bd5d7579227f 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -41,6 +41,9 @@ struct Argument { // The original program desc. std::unique_ptr origin_program_desc; + + // The processed program desc. + std::unique_ptr transformed_program_desc; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index c30a7c26cecbe67f0ca73223e06b2095584aca94..d09bf3ed161703b0cf273522921e157c7360a0bc 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -20,7 +20,7 @@ namespace paddle { namespace inference { namespace analysis { -// It is a better idea that the inputs and outputs of this graph is set manully +// It is a better idea that the inputs and outputs of this graph is set manually // before, but there must be a Pass that helps to prune the unnecessary ops that // do not contribute to the given targets, so in this pass, analysis and get the // inputs and outputs is OK. @@ -50,6 +50,25 @@ void DataFlowGraph::Build() { outputs.push_back(out); } } + + Clean(); +} + +void DataFlowGraph::Clean() { + for (auto &node : nodes.nodes()) { + std::unordered_set inlinks_set(node->inlinks.begin(), + node->inlinks.end()); + std::unordered_set outlinks_set(node->outlinks.begin(), + node->outlinks.end()); + if (inlinks_set.size() < node->inlinks.size()) { + LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; + node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); + } + if (outlinks_set.size() < node->outlinks.size()) { + LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; + node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); + } + } } std::string DataFlowGraph::DotString() const { diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 913e344d371ddf3ea05a53c216e5b3bea8f11c7b..a4fefc83e0c551d52bec87299bcbc966e7a2dbf7 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -47,6 +47,10 @@ struct DataFlowGraph { // Output a DOT graph file for debug. std::string DotString() const; + + private: + // Remove duplicate edges and so on. + void Clean(); }; /* @@ -133,17 +137,24 @@ struct GraphTraits { // Extract the inputs and outputs of a graph. The inputs and outputs of a // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. -std::pair< - std::vector, - std::vector< - Node *>> static ExtractInputAndOutputOfSubGraph(std::vector - &graph) { +static std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::unordered_set nodes(graph.begin(), graph.end()); std::unordered_set inputs; std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. 
+ auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inlinks) { + if (nodes.count(in)) return true; + } + return false; + }; for (auto &node : graph) { for (auto *in : node->inlinks) { - if (!nodes.count(in) && in->type() == Node::Type::kValue) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->type() == Node::Type::kValue && + !inlink_in_subgraph(in)) { inputs.insert(in); } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index f7d4cca2132d11eb89eee5a71ed0a3cc7381e1ff..29ca008123addf07959b965a4b54bf55b18c401d 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -13,21 +13,35 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" namespace paddle { namespace inference { namespace analysis { +using framework::proto::ProgramDesc; + +std::vector ExtractParameters( + const std::vector>& nodes); + bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument) ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) - desc_ = argument->origin_program_desc.get(); - // Here some logic from program_desc.cc and will not add new interfaces into - // framework::ProgramDesc class, use some UT to assure the correctness. - auto* block = desc_->mutable_blocks()->Add(); - block->set_idx(framework::kRootBlockIndex); - block->set_parent_idx(framework::kNoneBlockIndex); + PADDLE_ENFORCE(!argument->transformed_program_desc); + // The transformed_program_desc should inherit all the VarDesc and BlockDesc + // from the original program desc. The operators of the main block(the first + // block) should rewritten by data flow graph. + argument->transformed_program_desc.reset( + new ProgramDesc(*argument->origin_program_desc)); + argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex) + ->clear_ops(); + desc_ = argument->transformed_program_desc.get(); + argument_ = argument; return true; } @@ -37,14 +51,17 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { auto traits = GraphTraits(graph); for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { if (it->deleted()) continue; + switch (it->type()) { - case Node::Type::kFunction: - LOG(INFO) << "add function " << it->name(); + case Node::Type::kFunction: { + LOG(INFO) << "add function " << it->repr(); AddFluidOp(&(*it)); - break; - case Node::Type::kFunctionBlock: + } break; + case Node::Type::kFunctionBlock: { + LOG(INFO) << "add engine op " << it->repr() << " , " + << static_cast(&(*it))->subgraph.size(); AddEngineOp(&(*it)); - break; + } break; default: continue; } @@ -52,12 +69,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { } void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { - LOG(INFO) << "processing func " << node->name(); auto* ori_op = static_cast(node->pb_desc()); // currently only the main block is analyzed. 
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto* op = main_block->add_ops(); - LOG(INFO) << "to copy the op"; *op = *ori_op; // copy the attributes, by default, these will not be changed // by analysis phrase. // The inputs and outputs of the existing ops are not changed by tensorrt @@ -65,11 +80,90 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { // NOTE It might be changed by other passes in the long run. } +void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph, + const framework::proto::BlockDesc& block) { + static int counter{0}; + PADDLE_ENFORCE(node->IsFunctionBlock()); + framework::OpDesc desc; + auto* func = static_cast(node); + + // collect inputs + std::vector io; + for (auto* x : func->inlinks) { + io.push_back(x->name()); + } + desc.SetInput("Xs", io); + + // collect outputs + io.clear(); + for (auto* x : func->outlinks) { + io.push_back(x->name()); + } + desc.SetOutput("Ys", io); + + desc.SetType("tensorrt_engine"); + // Set attrs + SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); + SetAttr(desc.Proto(), "engine_unique_key", + "trt-" + std::to_string(counter++)); + SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "max_workspace", + 1024); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); + node->SetPbMsg(desc.Proto()->SerializeAsString()); +} + +std::vector ExtractParameters( + const std::vector>& nodes) { + std::vector parameters; + for (const auto& node : nodes) { + if (!node->IsValue()) continue; + PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); + framework::proto::VarDesc var; + var.ParseFromString(node->pb_msg()); + if (var.persistable()) { + parameters.push_back(var.name()); + } + } + return parameters; +} + void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { - // auto* ori_op = static_cast(node->extra_info()); - // auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - // auto* op = main_block->add_ops(); // TODO(Superjomn) Here need to expose some arguments for default setting. + PADDLE_ENFORCE(node->IsFunctionBlock()); + auto* block_node = static_cast(node); + framework::proto::BlockDesc proto; + framework::BlockDesc block_desc(nullptr, &proto); + // copy ops. 
+ for (auto* node : block_node->subgraph) { + auto* op = block_desc.AppendOp(); + PADDLE_ENFORCE(!node->pb_msg().empty()); + op->Proto()->ParseFromString(node->pb_msg()); + } + CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); + auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto* op = main_block->add_ops(); + PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); + op->ParseFromString(node->pb_msg()); +} + +namespace { +class DFG_DebuggerPass : public DFG_GraphvizDrawPass { + public: + using Config = DFG_GraphvizDrawPass::Config; + explicit DFG_DebuggerPass(const Config& config) + : DFG_GraphvizDrawPass(config) {} + + std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } + + bool Finalize() override { return true; } +}; +} // namespace + +Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { + return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( + FLAGS_inference_analysis_graphviz_log_root, + "data_flow_graph_to_fluid_graphviz_debugger")); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index cbb05f622cc29c99c57e649b1c57cf3e54541191..edc84b02ed20991e3e7c6c437d2b1fac169bae03 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -19,6 +19,7 @@ #pragma once +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/pass.h" @@ -40,10 +41,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { return "Transform a DFG to a Fluid ProgramDesc"; } - Pass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const override { - return nullptr; - } + Pass *CreateGraphvizDebugerPass() const override; protected: // Add a Fluid Op into the ProgramDesc. 
@@ -53,6 +51,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { private: framework::proto::ProgramDesc *desc_; + Argument *argument_; }; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc index afffb3feb0c515faa554d0d4919c442ca4515294..a6f85484756417e103cbb60bcb664e8b800b9f28 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -18,12 +18,19 @@ namespace paddle { namespace inference { namespace analysis { +int DFG_GraphvizDrawPass::counter_{0}; + void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { auto content = Draw(graph); - std::ofstream file(GenDotPath()); + auto dot_path = GenDotPath(); + std::ofstream file(dot_path); file.write(content.c_str(), content.size()); file.close(); - LOG(INFO) << "draw dot to " << GenDotPath(); + + auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; + std::string message; + LOG(INFO) << "draw to " << png_path; + ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); } std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { @@ -41,9 +48,7 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { if (!config_.display_deleted_node && node.deleted()) continue; for (auto &in : node.inlinks) { if (!config_.display_deleted_node && in->deleted()) continue; - for (auto &in : node.inlinks) { - dot.AddEdge(in->repr(), node.repr(), {}); - } + dot.AddEdge(in->repr(), node.repr(), {}); } } return dot.Build(); diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h index 93ebff59ae9691394858f32c822a5e70f3345581..17445ab4407a159ca11345bc9a9226b3ad0044f0 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -46,24 +46,29 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass { const bool display_deleted_node; }; - DFG_GraphvizDrawPass(const Config &config) : config_(config) {} + explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {} bool Initialize(Argument *argument) override { return true; } void Run(DataFlowGraph *graph) override; - bool Finalize() override { return Pass::Finalize(); } + bool Finalize() override { return true; } std::string repr() const override { return "DFG graphviz drawer"; } std::string description() const override { return "Debug a DFG by draw with graphviz"; } - private: + protected: + // A counter to add a number prefix to the debugger image output so that they + // will sort in the triggered order. + static int counter_; + // Path of the dot file to output. 
std::string GenDotPath() const { - return config_.dir + "/" + "graph_" + config_.id + ".dot"; + return config_.dir + "/" + std::to_string(counter_++) + "-graph_" + + config_.id + ".dot"; } - std::string Draw(DataFlowGraph *graph); + virtual std::string Draw(DataFlowGraph *graph); Config config_; }; diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index f4b5c5fd2201cc9ff56d7ee8d8921376c2c9c59e..162455b9c4e06b7fbb4bdede30444faf6a8a1509 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -31,7 +31,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { pass.Run(&dfg); // test content - std::ifstream file("./graph_test.dot"); + std::ifstream file("./0-graph_test.dot"); ASSERT_TRUE(file.is_open()); std::string line; @@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { no++; } // DFG is sensitive to ProgramDesc, be careful to change the existing models. - ASSERT_EQ(no, 112); + ASSERT_EQ(no, 82); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 5f62eef52876ac68dfab00348f422a46de123cfe..e918622d74cfb11d83090555be2a768cc14e7742 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" namespace paddle { @@ -33,7 +35,7 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { return true; } -bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); } +bool FluidToDataFlowGraphPass::Finalize() { return true; } void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); @@ -46,6 +48,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { auto *v = graph->nodes.Create(Node::Type::kValue); v->SetName(var.name()); v->SetPbDesc(const_cast(static_cast(&var))); + v->SetPbMsg(var.SerializeAsString()); var2id[var.name()] = v->id(); } for (int i = 0; i < main_block.ops_size(); i++) { @@ -56,6 +59,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { // Link to the original protobuf message's memory, make it easier to // generate from a data flow graph to fluid ProgramDesc. o->SetPbDesc(const_cast(static_cast(&op))); + o->SetPbMsg(op.SerializeAsString()); + // set inputs and outputs // TODO(Superjomn) make sure the InputNames is the real variable name. 
for (int j = 0; j < op.inputs_size(); j++) { @@ -79,9 +84,20 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { graph->Build(); } -Pass *FluidToDataFlowGraphPass::CreatePrinterPass( - std::ostream &os, const std::string &banner) const { - return nullptr; +namespace { +class DFG_DebuggerPass : public DFG_GraphvizDrawPass { + public: + using Config = DFG_GraphvizDrawPass::Config; + explicit DFG_DebuggerPass(const Config &config) + : DFG_GraphvizDrawPass(config) {} + std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } + bool Finalize() override { return true; } +}; +} + +Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { + return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( + FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger")); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index 176faf0220cc98bf2c0384af75125d4bc493e753..da8463b63bd0bb1633bfcb9d7d41a884ddd632c7 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -46,8 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { return "transform a fluid ProgramDesc to a data flow graph."; } - Pass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const override; + Pass *CreateGraphvizDebugerPass() const override; private: framework::proto::ProgramDesc const *desc_; diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca40c01fc57dbcc2ca16770a1b7d798de8b5625b --- /dev/null +++ b/paddle/fluid/inference/analysis/helper.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace inference { +namespace analysis { + +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const std::string &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const int &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const int64_t &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::LONG); + attr->set_l(data); +} +template <> +void SetAttr>(framework::proto::OpDesc *op, + const std::string &name, + const std::vector &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRINGS); + for (const auto &s : data) { + attr->add_strings(s.c_str()); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index f0039e113159fdcc0cc1c209a8bc899bc82984c1..fff1621d3f1bb31cfa04110d1f3cf5dbfe927331 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -14,10 +14,12 @@ limitations under the License. */ #pragma once +#include #include #include #include +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" @@ -26,6 +28,10 @@ namespace paddle { namespace inference { namespace analysis { +template +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const T &data); + template int AccuDims(Vec &&vec, int size) { int res = 1; @@ -93,7 +99,7 @@ template class OrderedRegistry { public: T *Register(const std::string &name, T *x) { - PADDLE_ENFORCE(!dic_.count(name)); + PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name); dic_[name] = data_.size(); data_.emplace_back(std::unique_ptr(x)); return data_.back().get(); @@ -117,6 +123,20 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { return *var->GetMutable(); } +static void ExecShellCommand(const std::string &cmd, std::string *message) { + char buffer[128]; + std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); + if (!pipe) { + LOG(ERROR) << "error running command: " << cmd; + return; + } + while (!feof(pipe.get())) { + if (fgets(buffer, 128, pipe.get()) != nullptr) { + *message += buffer; + } + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc index 3339b5044df0cf91d00aa9ddad310d4bf263bc3c..d9d265d225bb77a3f5f83cbd0b8b1c670fb34a31 100644 --- a/paddle/fluid/inference/analysis/node.cc +++ b/paddle/fluid/inference/analysis/node.cc @@ -20,6 +20,17 @@ namespace paddle { namespace inference { namespace analysis { +template <> +std::string &NodeAttr::As() { + if (data_.empty()) { + type_hash_ = typeid(std::string).hash_code(); + } + PADDLE_ENFORCE_EQ(type_hash_, typeid(std::string).hash_code()); + return data_; +} + +std::string &NodeAttr::String() { 
return As(); } + std::vector Value::dot_attrs() const { return std::vector({Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 8c2e6d88b9605d9923d002f73b60cd92b5e551b7..8ecd1ae730e6ec6775f4a22fdc5dec0e8ca8e2d1 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -35,6 +35,44 @@ namespace analysis { class NodeMap; +// A helper class to maintain the status from Pass. +struct NodeAttr { + // NOTE T should be a primary type or a struct combined by several primary + // types. + // NOTE the STL containers should not use here. + // Some usages + // Attr attr; + // attr.Bool() = true; + + bool &Bool() { return As(); } + float &Float() { return As(); } + int32_t &Int32() { return As(); } + int64_t &Int64() { return As(); } + void *&Pointer() { return As(); } + std::string &String(); + + private: + template + T &As() { + // init storage in the first usage. + if (data_.empty()) { + VLOG(4) << "resize data to " << sizeof(T); + type_hash_ = typeid(T).hash_code(); + data_.resize(sizeof(T)); + } + PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), + "type not matched, origin is %s, want %s", + DataTypeNamer::Global().repr(type_hash_), + DataTypeNamer::Global().repr()); + PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); + return *reinterpret_cast(&data_[0]); + } + + private: + std::string data_; + size_t type_hash_{std::numeric_limits::max()}; +}; + /* * Node Representation. * @@ -50,8 +88,6 @@ class Node { Node() = default; - struct Attr; - // Cast to a subclass type, Function for example. template Subclass &As() { @@ -71,7 +107,7 @@ class Node { // Get an additional attribute and convert it to T data type. NOTE this will // silently create a new attribute if not exists. - Attr &attr(const std::string &name) const { return attrs_[name]; } + NodeAttr &attr(const std::string &name) const { return attrs_[name]; } int id() const { return id_; } @@ -80,6 +116,9 @@ class Node { void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; } void *pb_desc() const { return attr("pb_desc").Pointer(); } + void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; } + const std::string &pb_msg() const { return attr("pb_msg").String(); } + void SetDeleted() { deleted_ = true; } bool deleted() const { return deleted_; } @@ -94,43 +133,6 @@ class Node { // Output links. std::vector outlinks; - // A helper class to maintain the status from Pass. - struct Attr { - // NOTE T should be a primary type or a struct combined by several primary - // types. - // NOTE the STL containers should not use here. - // Some usages - // Attr attr; - // attr.Bool() = true; - - bool &Bool() { return As(); } - float &Float() { return As(); } - int32_t &Int32() { return As(); } - int64_t &Int64() { return As(); } - void *&Pointer() { return As(); } - - private: - template - T &As() { - // init storage in the first usage. - if (data_.empty()) { - VLOG(4) << "resize data to " << sizeof(T); - type_hash_ = typeid(T).hash_code(); - data_.resize(sizeof(T)); - } - PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), - "type not matched, origin is %s, want %s", - DataTypeNamer::Global().repr(type_hash_), - DataTypeNamer::Global().repr()); - PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); - return *reinterpret_cast(&data_[0]); - } - - private: - std::string data_; - size_t type_hash_{std::numeric_limits::max()}; - }; - // Type checks. 
bool IsFunction() const { return type_ == Node::Type::kFunction; } bool IsValue() const { return type_ == Node::Type::kValue; } @@ -150,7 +152,7 @@ class Node { Type type_{Type::kNone}; // Mark this node is deleted by some pass. bool deleted_{false}; - mutable std::unordered_map attrs_; + mutable std::unordered_map attrs_; }; class Function; @@ -213,6 +215,10 @@ class Function : public Node { struct FunctionBlock : public Node { std::string repr() const override { return "block-" + std::to_string(id()); } std::vector subgraph; + + protected: + FunctionBlock() { SetType(Node::Type::kFunctionBlock); } + friend class NodeMap; }; class NodeMap { @@ -227,7 +233,7 @@ class NodeMap { void Delete(size_t id); - const std::vector> &nodes() { return nodes_; } + const std::vector> &nodes() const { return nodes_; } size_t size() const { return nodes_.size(); } diff --git a/paddle/fluid/inference/analysis/node_attr_flags.h b/paddle/fluid/inference/analysis/node_attr_flags.h new file mode 100644 index 0000000000000000000000000000000000000000..a3f70e5419a66969e8fb20152a8a8ace39316f57 --- /dev/null +++ b/paddle/fluid/inference/analysis/node_attr_flags.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file contains all the flags that declared in Node::Attr. + * + * The Node::Attr is designed to share information between different passes, one + * can get other's attributes in a Node by the flags in this file. + */ +#pragma once +namespace paddle { +namespace inference { +namespace analysis { + +#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__; + +DECLARE_NODE_ATTR(supported_by_tensorrt) // bool + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 65632b749177add9dcb297bffad1e85f68a80b02..25c566ebfa41abe3a247bc6c6e5583c8620a6abb 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -60,6 +60,9 @@ class Pass { return nullptr; } + // Create a debugger Pass that draw the DFG by graphviz toolkit. + virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } + // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } // Run on a single Function. 
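The new node_attr_flags.h above centralizes the attribute keys that different passes use to exchange per-node information. A self-contained sketch of what the DECLARE_NODE_ATTR macro expands to (the main() here is only for illustration):

#include <cstring>
#include <iostream>

#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__;

DECLARE_NODE_ATTR(supported_by_tensorrt)  // expands to ATTR_supported_by_tensorrt

int main() {
  // the constant's value equals its own (unprefixed) name, so passes can
  // share one key without scattering raw string literals around
  std::cout << ATTR_supported_by_tensorrt << "\n";  // prints: supported_by_tensorrt
  return std::strcmp(ATTR_supported_by_tensorrt, "supported_by_tensorrt");
}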
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index b17c0e0d724ebeea7b84bf63024cd141891a78b4..b428bb22b1f0c5c1a47fc4c46c9070c1ace4a228 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -19,6 +19,18 @@ namespace paddle { namespace inference { namespace analysis { +bool PassManager::Initialize(Argument* argument) { + argument_ = argument; + for (auto& pass : data_) { + LOG(INFO) << "Initializing pass " << pass->repr(); + if (!pass->Initialize(argument)) { + LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; + return false; + } + } + return true; +} + void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); for (auto& pass : data_) { diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h index 7841c4b9d08001264af9f3a248a96814d1c273c4..81a17e0287a5aef8a328e43380ee3691f5a32379 100644 --- a/paddle/fluid/inference/analysis/pass_manager.h +++ b/paddle/fluid/inference/analysis/pass_manager.h @@ -50,17 +50,7 @@ class PassManager : public OrderedRegistry { // globally shared, so pass them as the arguemnts for all the pass managers. virtual bool Initialize(const Argument& argument) { return false; } - virtual bool Initialize(Argument* argument) { - argument_ = argument; - for (auto& pass : data_) { - LOG(INFO) << "Initializing pass " << pass->repr(); - if (!pass->Initialize(argument)) { - LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; - return false; - } - } - return true; - } + virtual bool Initialize(Argument* argument); // Call all the passes' Finalize methods. virtual bool Finalize() { diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index 7af6a199514636224f0b8303abea7d398400d278..dac1c509d728114bd24a2ea1150c407646026fd4 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/inference/analysis/pass_manager.h" +#include + #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include - namespace paddle { namespace inference { namespace analysis { @@ -64,6 +64,7 @@ TEST_F(DFG_Tester, DFG_pass_manager) { manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + ASSERT_TRUE(&argument); ASSERT_TRUE(manager.Initialize(&argument)); manager.RunAll(); } diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 43ccac96c84e987ad1f494af3e314c810fc1ffe3..389f9e1a9148a4daf0e5b751cce5cb6325252a4e 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -119,10 +119,12 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { + std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. Mark the nodes inside the sub-graph // as deleted. 3. Replace the deleted node with the new Block Node. - auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock); + auto *block_node = static_cast( + graph_->nodes.Create(Node::Type::kFunctionBlock)); auto io = ExtractInputAndOutputOfSubGraph(subgraph); block_node->inlinks = std::move(io.first); block_node->outlinks = std::move(io.second); @@ -130,21 +132,25 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. node->SetDeleted(); + block_node->subgraph.push_back(node); } - std::unordered_map - delelte_node_map; // deleted node to BlockNode - for (auto *n : block_node->inlinks) { - n->inlinks.clear(); - } - for (auto *n : block_node->outlinks) { - n->outlinks.clear(); - } - for (auto *n : block_node->inlinks) { - n->outlinks.push_back(block_node); + // Change all the sub-graph's inputs and outputs corresponding inlink and + // outlink to this sub-graph node. + auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (subgraph_uniq.count(n)) { + n = block_node; + } + } + std::unordered_set uniq(nodes.begin(), nodes.end()); + nodes.assign(uniq.begin(), uniq.end()); + }; + for (auto *i : block_node->inlinks) { + inlink_or_outlink_cleaner(i->outlinks); } - for (auto *n : block_node->outlinks) { - n->inlinks.push_back(n); + for (auto *&o : block_node->outlinks) { + inlink_or_outlink_cleaner(o->inlinks); } } } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f736e385c11add152dc9ab9485bf1de40f80b2f3 --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
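The reworked SubGraphFuse::ReplaceNodesWithSubGraphs above no longer clears the in/out links of boundary nodes; it redirects every edge that pointed into the fused sub-graph to the single block node and then drops duplicate edges. A simplified standalone sketch of that rewiring step, with FakeNode standing in for the real Node class:

#include <unordered_set>
#include <vector>

struct FakeNode {
  std::vector<FakeNode *> inlinks, outlinks;
};

void RewireToBlock(std::vector<FakeNode *> &links,
                   const std::unordered_set<FakeNode *> &subgraph,
                   FakeNode *block) {
  for (auto *&n : links) {
    if (subgraph.count(n)) n = block;  // redirect edges into the sub-graph
  }
  std::unordered_set<FakeNode *> uniq(links.begin(), links.end());
  links.assign(uniq.begin(), uniq.end());  // remove duplicated edges
}

int main() {
  FakeNode a, b, block, outside;
  outside.outlinks = {&a, &b};  // two edges into nodes of the fused sub-graph
  RewireToBlock(outside.outlinks, {&a, &b}, &block);
  // outside now has a single edge pointing at the block node
  return outside.outlinks.size() == 1 ? 0 : 1;
}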
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/node_attr_flags.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { + for (auto &node : graph->nodes.nodes()) { + node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get()); + } +} + +class DfgDebuggerPass : public DFG_GraphvizDrawPass { + public: + explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) + : DFG_GraphvizDrawPass(config) {} + + std::string repr() const override { + return "tensorrt-subgraph-node-mark-debugger"; + } + + bool Finalize() override { return true; } + + protected: + std::string Draw(DataFlowGraph *graph) override { + Dot dot; + // Add nodes + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (config_.display_deleted_node || !node.deleted()) { + auto dot_attr = node.dot_attrs(); + if (node.attr(ATTR_supported_by_tensorrt).Bool()) { + dot_attr.assign( + {Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}}); + } + dot.AddNode(node.repr(), dot_attr); + } + } + // Add edges + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (!config_.display_deleted_node && node.deleted()) continue; + for (auto &in : node.inlinks) { + if (!config_.display_deleted_node && in->deleted()) continue; + dot.AddEdge(in->repr(), node.repr(), {}); + } + } + return dot.Build(); + } +}; + +Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { + DFG_GraphvizDrawPass::Config config( + FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node"); + return new DfgDebuggerPass(config); +} +bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..c558a6ebbde371071c7330a14cc986bf764d1773 --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h @@ -0,0 +1,60 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
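The DfgDebuggerPass above re-colors nodes that the marking pass flagged as TensorRT-supported when drawing the data flow graph. As a rough standalone illustration of what such a Draw() emits, here is a sketch that builds the Graphviz DOT text directly instead of going through Paddle's Dot helper (MiniNode is a stand-in type):

#include <sstream>
#include <string>
#include <vector>

struct MiniNode {
  std::string name;
  bool supported_by_tensorrt;
  std::vector<int> inlinks;  // indices of predecessor nodes
};

std::string Draw(const std::vector<MiniNode> &nodes) {
  std::ostringstream dot;
  dot << "digraph G {\n";
  for (const auto &n : nodes) {
    dot << "  \"" << n.name << "\"";
    if (n.supported_by_tensorrt) dot << " [color=green, style=filled]";
    dot << ";\n";
  }
  for (size_t i = 0; i < nodes.size(); ++i) {
    for (int in : nodes[i].inlinks) {
      dot << "  \"" << nodes[in].name << "\" -> \"" << nodes[i].name << "\";\n";
    }
  }
  dot << "}\n";
  return dot.str();
}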
+// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops + * that supported by TensorRT engine. + */ + +#pragma once + +#include +#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Mark the operators that TensorRT engine supports. + */ +class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { + public: + using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller; + + explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller) + : teller_(teller) {} + + bool Initialize(Argument* argument) override { return true; } + + // This class get a sub-graph as input and determine whether to transform this + // sub-graph into TensorRT. + void Run(DataFlowGraph* graph) override; + + std::string repr() const override { return "tensorrt-sub-subgraph-mark"; } + std::string description() const override { + return "tensorrt sub-graph mark pass"; + } + + Pass* CreateGraphvizDebugerPass() const override; + bool Finalize() override; + + private: + teller_t teller_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..a6c15e848b99ca318f4583e3d4b88345fe8e5ebc --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" + +#include +#include "paddle/fluid/inference/analysis/node_attr_flags.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { + // init + FluidToDataFlowGraphPass pass; + ASSERT_TRUE(pass.Initialize(&argument)); + argument.main_dfg.reset(new DataFlowGraph); + pass.Run(argument.main_dfg.get()); + + TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { + return node->IsFunction() && + static_cast(node)->func_type() == "mul"; + }; + TensorRTSubgraphNodeMarkPass pass1(teller); + ASSERT_TRUE(pass1.Initialize(&argument)); + pass1.Run(argument.main_dfg.get()); + + int counter{0}; + for (auto& node : argument.main_dfg->nodes.nodes()) { + counter += node->attr(ATTR_supported_by_tensorrt).Bool(); + } + + LOG(INFO) << counter << " nodes marked"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index c7f40d43c922a328febd343cea7240fcb09f3d02..9993de22800bc0aafdcbf46618e6b479ac1eb187 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( : node_inside_subgraph_teller_(teller) {} void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { - SubGraphFuse(graph, node_inside_subgraph_teller_); + SubGraphFuse(graph, node_inside_subgraph_teller_)(); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index 79e9e2bcc9e626a102dfdab6f1f50c8d58f9bbdd..c6741a92095d33d261a4e1667c87a8ca02e51a9f 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h" @@ -30,7 +31,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { // Tell whether to transform a sub-graph into TensorRT. using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; - TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); + explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); bool Initialize(Argument* argument) override { return true; } @@ -38,10 +39,15 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { // sub-graph into TensorRT. 
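One small but important fix above is in TensorRTSubGraphPass::Run, where the added "()" makes the SubGraphFuse functor actually execute; the previous statement only constructed and destroyed a temporary, so the fusion never ran. A minimal reproduction of the pitfall:

#include <iostream>

struct Fuse {
  // stands in for SubGraphFuse: all the work happens in operator()
  void operator()() { std::cout << "sub-graphs fused\n"; }
};

int main() {
  Fuse{};    // constructs and destroys a temporary; operator() never runs
  Fuse{}();  // constructs the temporary and invokes it, which is the fix
}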
void Run(DataFlowGraph* graph) override; + bool Finalize() override { return true; } + + std::string repr() const override { return "tensorrt-sub-graph"; } + std::string description() const override { return "tensorrt sub graph pass"; } + private: NodeInsideSubgraphTeller node_inside_subgraph_teller_; }; } // namespace analysis } // namespace inference -} // paddle +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index d12dcf0d0fe7f9354f7ed1aac924aeab3403e9b8..1d749d3fa3f39b351ccee6ebeb82467f7220a0b6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -23,49 +23,48 @@ namespace paddle { namespace inference { namespace analysis { -DEFINE_string(model_dir, "", "inference test model dir"); +DEFINE_string(dot_dir, "./", ""); -TEST(TensorRTSubGraph, single_pass) { - auto desc = LoadProgramDesc(); - auto dfg = ProgramDescToDFG(desc); - - SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { +TEST_F(DFG_Tester, tensorrt_single_pass) { + std::unordered_set teller_set( + {"elementwise_add", "mul", "sigmoid"}); + SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { if (node->type() != Node::Type::kFunction) return false; const auto* func = static_cast(node); - if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || - func->func_type() == "conv2d" || func->func_type() == "mul" || - func->func_type() == "sigmoid" || func->func_type() == "softmax") { - LOG(INFO) << "sub-graph marked " << node->repr(); - return true; - } + if (teller_set.count(func->func_type())) return true; return false; }; - DFG_GraphvizDrawPass::Config config{"./", "test"}; - DFG_GraphvizDrawPass dfg_pass(config); - dfg_pass.Initialize(); - - DFG_GraphvizDrawPass dfg_pass1(config); - dfg_pass1.Initialize(); - - dfg_pass.Run(&dfg); + LOG(INFO) << "init"; + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; + DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; + DFG_GraphvizDrawPass dfg_pass(config); + DFG_GraphvizDrawPass dfg_pass1(config1); + FluidToDataFlowGraphPass pass0; TensorRTSubGraphPass trt_pass(std::move(teller)); - trt_pass.Initialize(); - trt_pass.Run(&dfg); + LOG(INFO) << "Initialize"; + dfg_pass.Initialize(&argument); + dfg_pass1.Initialize(&argument); + pass0.Initialize(&argument); + trt_pass.Initialize(&argument); - dfg_pass1.Run(&dfg); + LOG(INFO) << "Run"; + argument.main_dfg.reset(new DataFlowGraph); + pass0.Run(argument.main_dfg.get()); + dfg_pass.Run(argument.main_dfg.get()); + trt_pass.Run(argument.main_dfg.get()); + dfg_pass1.Run(argument.main_dfg.get()); // Check the TRT op's block desc - for (auto node : dfg.nodes.nodes()) { + for (auto& node : argument.main_dfg->nodes.nodes()) { if (node->IsFunctionBlock()) { + LOG(INFO) << "get function block"; } } } -TEST(TensorRTSubGraph, pass_manager) {} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index cbba8b9d559e024fc1e955489bb8d37c77097d25..03b0b6946339772ac535b3471d50fbd74554239d 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/operators/math/blas.h" #ifdef PADDLE_WITH_MKLML -#include #include #endif @@ -164,7 +164,7 @@ TEST(inference, nlp) { // only use 1 thread number per std::thread omp_set_dynamic(0); omp_set_num_threads(1); - mkl_set_num_threads(1); + paddle::operators::math::SetNumThreads(1); #endif double start_ms = 0, stop_ms = 0; diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c74f62de5c6f5d432ee928945db6dcf385ca209..bd98ed81899440a46415d30b6d74fec2dac4c155 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -20,6 +20,12 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" +DEFINE_bool(init_allocated_mem, false, + "It is a mistake that the values of the memory allocated by " + "BuddyAllocator are always zeroed in some op's implementation. " + "To find this error in time, we use init_allocated_mem to indicate " + "that initializing the allocated memory with a small value " + "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { @@ -41,6 +47,9 @@ template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } VLOG(10) << " pointer=" << p; return p; } @@ -104,6 +113,9 @@ void* Alloc(platform::CUDAPlace place, size_t size) { LOG(WARNING) << "GPU memory used: " << Used(place); platform::SetDeviceId(cur_dev); } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } return ptr; } @@ -137,6 +149,9 @@ void* Alloc(platform::CUDAPinnedPlace place, LOG(WARNING) << "cudaMallocHost Cannot allocate " << size << " bytes in CUDAPinnedPlace"; } + if (FLAGS_init_allocated_mem) { + memset(ptr, 0xEF, size); + } return ptr; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d3988ae16d7d4ceccaf01503c6200066f2fa4073..9dc39ad0ddf8c5de3e1960a1171431e026de35ae 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -195,7 +195,7 @@ if(WITH_DISTRIBUTE) endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - foreach(dist_op "prefetch_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op") + foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op") op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endforeach() @@ -216,7 +216,7 @@ if(WITH_DISTRIBUTE) set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) endif() else() - set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) + set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) @@ -226,7 +226,8 @@ op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine) nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc - DEPS tensorrt_engine_op tensorrt_engine 
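Regarding the new init_allocated_mem flag in paddle/fluid/memory/malloc.cc above: when it is set, every buffer handed out by the allocators is filled with 0xEF, so kernels that wrongly assume zero-initialized memory fail visibly in unit tests instead of passing by accident. The idea in isolation (DebugAlloc and the global flag are stand-ins for this sketch, not Paddle APIs):

#include <cstdlib>
#include <cstring>

static bool g_init_allocated_mem = true;  // stand-in for FLAGS_init_allocated_mem

void *DebugAlloc(std::size_t size) {
  void *p = std::malloc(size);
  if (p != nullptr && g_init_allocated_mem) {
    // poison the buffer with a recognizable junk byte instead of leaving
    // whatever the allocator returned (often, but not always, zero)
    std::memset(p, 0xEF, size);
  }
  return p;
}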
tensorrt_converter) + DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter + analysis) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc index 6ee73c3000fb45b4e1cd5bbb730da7d61b494b6f..5d670fe3b9d99a31a628ff707ff860564eca952e 100644 --- a/paddle/fluid/operators/adam_op.cc +++ b/paddle/fluid/operators/adam_op.cc @@ -56,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel { "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of AdamOp should have same dimension"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + } PADDLE_ENFORCE_EQ( param_dims, ctx->GetInputDim("Moment1"), "Param and Moment1 input of AdamOp should have same dimension"); diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index f82ff47b52490c354f383515d430d14e24cbf6af..a7a28b02b67f2ef180ec0e273dbe7ef555f88ce2 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -282,6 +282,10 @@ class AdamOpKernel : public framework::OpKernel { } else if (grad_var->IsType()) { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } // merge duplicated rows if any. scatter::MergeAdd merge_func; auto grad_merge = diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index fc64e51b34d14d1a7b6e17a0f2d4f13b15a69d4b..7d5199aae7da4eed5afa6b8bd64c04a540b915d4 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -26,14 +26,15 @@ namespace operators { using Tensor = framework::Tensor; using platform::PADDLE_CUDA_NUM_THREADS; +const int kMaxRank = 9; // The max rank of a tensor allowed in Fluid + __global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size, int axis, int64_t n, int64_t* trg_idx, int64_t* med_ids) { int64_t index = threadIdx.x + blockDim.x * blockIdx.x; if (index < n) { - const int max_rank = 9; // Max rank of a tensor allow in Fluid - int64_t shape_out_axis[max_rank - 1] = {0}; - int64_t dims_out_axis[max_rank - 1] = {0}; + int64_t shape_out_axis[kMaxRank - 1] = {0}; + int64_t dims_out_axis[kMaxRank - 1] = {0}; int64_t tmp = index; int64_t pos_in_axis = 0; int64_t i = dims_size - 2; @@ -125,10 +126,8 @@ class ArgsortOpCUDAKernel : public framework::OpKernel { Tensor trg_idx_t; int64_t* trg_idx = trg_idx_t.mutable_data(in_dims, ctx.GetPlace()); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); - int num_threads = PADDLE_CUDA_NUM_THREADS; + auto stream = ctx.cuda_device_context().stream(); + const int num_threads = PADDLE_CUDA_NUM_THREADS; ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>( in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data); diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc index 4ad6f3443db33fd14b67091d14fd877b951730ff..a757916be7f6ece9b783d51d1051aac6a276795b 100644 --- a/paddle/fluid/operators/assign_value_op.cc +++ b/paddle/fluid/operators/assign_value_op.cc @@ -70,6 +70,7 @@ $$Out = values$$ namespace ops = paddle::operators; -REGISTER_OPERATOR(assign_value, ops::AssignValueOp, 
ops::AssignValueOpMaker); +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel, ops::AssignValueKernel); diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 25864e95d7e290c7f684501893e99c828c511979..f389eab605e087c535b9918264e6502217062505 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -19,28 +19,28 @@ namespace operators { template <> void GetAccumulators( - const framework::ExecutionContext& ctx, int64_t* num_updates_, - int64_t* num_accumulates_, int64_t* old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t* num_updates, + int64_t* num_accumulates, int64_t* old_num_accumulates) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); - *old_num_accumulates_ = in_old_num_accumulates->data()[0]; - *num_accumulates_ = in_num_accumulates->data()[0]; - *num_updates_ = in_num_updates->data()[0]; + *old_num_accumulates = in_old_num_accumulates->data()[0]; + *num_accumulates = in_num_accumulates->data()[0]; + *num_updates = in_num_updates->data()[0]; } template <> void SetAccumulators( - const framework::ExecutionContext& ctx, int64_t num_updates_, - int64_t num_accumulates_, int64_t old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t num_updates, + int64_t num_accumulates, int64_t old_num_accumulates) { auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); auto* out_num_accumulates = ctx.Output("out_num_accumulates"); auto* out_num_updates = ctx.Output("out_num_updates"); - out_old_num_accumulates->data()[0] = old_num_accumulates_; - out_num_accumulates->data()[0] = num_accumulates_; - out_num_updates->data()[0] = num_updates_; + out_old_num_accumulates->data()[0] = old_num_accumulates; + out_num_accumulates->data()[0] = num_accumulates; + out_num_updates->data()[0] = num_updates; } class AverageAccumulatesOp : public framework::OperatorWithKernel { @@ -177,7 +177,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( AverageAccumulates Operator. -Accumulate the sum of parameter whtin sliding window. The size of sliding window is +Accumulate the sum of parameter within sliding window. The size of sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'. Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'. 'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'. 
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 07ac5ced11605f6d0d5164d1c0f69acbd7bbed60..3958d3f685470f2505abf0e8bfd269d3834970ae 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -54,8 +54,9 @@ class AverageAccumulatesKernel : public framework::OpKernel { float average_window = ctx.Attr("average_window"); int64_t max_average_window = ctx.Attr("max_average_window"); int64_t min_average_window = ctx.Attr("min_average_window"); - min_average_window = - std::min(min_average_window, max_average_window); + PADDLE_ENFORCE_LE(min_average_window, max_average_window, + "min_average_window shouldn't be larger than " + "max_average_window"); // Get inputs auto* param = ctx.Input("param"); diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index cc158e57f7140c84f02bc7e091d8eac0d2b672e1..9ab2179b5fe689762704039c5f67dd080e530aa5 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -66,6 +66,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool fuse_with_relu = ctx.Attr("fuse_with_relu"); const auto *x = ctx.Input("X"); const auto *mean = ctx.Input("Mean"); @@ -111,11 +112,15 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; + if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; @@ -249,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { using bn_bwd_types = bn_type_traits; // create mkldnn memory from input diff_y tensor - auto user_diff_dst_memory = - memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, - mkldnn_engine}, - to_void_cast(diff_y_data)); + + mkldnn::memory::format dst_format = + platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); + + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // for diff_dst, try to use same format as dst in forward pass auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 52b0bf85c07fee380f9e7ba1c703b56367628644..693bf973c2b8790d2c50cee9b86b365493e8c754 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ 
b/paddle/fluid/operators/batch_norm_op.cc @@ -155,6 +155,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_with_relu", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Batch Normalization. diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index c3dd22119ddab8ecf9213ee274e4cbd4f05e78fd..10d678111f5325e495b24286e6ecf651230393fe 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_decode_op.h" +#include #include + +#include "paddle/fluid/operators/beam_search_decode_op.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -22,8 +24,11 @@ namespace operators { struct BeamSearchDecodeFunctor { BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, - LoDTensor* id_tensor, LoDTensor* score_tensor) - : step_ids_origin_(step_ids), + LoDTensor* id_tensor, LoDTensor* score_tensor, + size_t beam_size, int end_id) + : beam_size_(beam_size), + end_id_(end_id), + step_ids_origin_(step_ids), step_scores_origin_(step_scores), id_tensor_(id_tensor), score_tensor_(score_tensor) { @@ -37,9 +42,11 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_id : step_ids_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_id.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + } out.set_lod(step_id.lod()); step_ids_.push_back(out); @@ -53,9 +60,12 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_score : step_scores_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_score.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, + &out); + dev_ctx->Wait(); + } out.set_lod(step_score.lod()); step_scores_.push_back(out); @@ -67,6 +77,8 @@ struct BeamSearchDecodeFunctor { void operator()() const; bool tensor_on_gpu_; + size_t beam_size_; + int end_id_; const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_scores_origin_; LoDTensorArray step_ids_ = LoDTensorArray(); @@ -77,14 +89,14 @@ struct BeamSearchDecodeFunctor { template void BeamSearchDecodeFunctor::operator()() const { - BeamSearchDecoder beam_search_decoder; + BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); // Check if the tensor is on GPU. 
If so, use the CPU copy instead if (tensor_on_gpu_) { - beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, - score_tensor_); + beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, + score_tensor_); } else { - beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, - id_tensor_, score_tensor_); + beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_, + id_tensor_, score_tensor_); } } @@ -122,13 +134,17 @@ class BeamSearchDecodeOp : public framework::OperatorBase { "Level of LodTensor should be 2"); } + size_t beam_size = ctx.Attr("beam_size"); + int end_id = ctx.Attr("end_id"); + // prepare output LoDTensor* sentenceIds = ctx.Output("SentenceIds"); LoDTensor* sentenceScores = ctx.Output("SentenceScores"); framework::VisitDataType( framework::ToDataType(scores->at(0).type()), - BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores, + beam_size, end_id)); } }; @@ -137,18 +153,32 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Ids", "(LodTensorArray)" - "score of the candidate words in each step"); + "The LodTensorArray containing the selected ids of all steps"); AddInput("Scores", "(LodTensorArray)" - "score of the candidate words in each step"); - AddOutput("SentenceIds", - "(LodTensor)" - "All possible result sentences of word ids"); - AddOutput("SentenceScores", - "(LodTensor)" - "All possible result sentences of word scores"); + "The LodTensorArray containing the selected scores of all steps"); + AddOutput( + "SentenceIds", + "(LodTensor)" + "An LodTensor containing all generated id sequences for all source " + "sentences"); + AddOutput( + "SentenceScores", + "(LodTensor)" + "An LodTensor containing scores corresponding to Output(SentenceIds)"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); AddComment(R"DOC( -Pack the result of Beam search op into SentenceIds and SentenceScores. +Beam Search Decode Operator. This Operator constructs the full hypotheses for +each source sentence by walking back along the LoDTensorArray Input(ids) +whose lods can be used to restore the path in the beam search tree. + +The Output(SentenceIds) and Output(SentenceScores) separately contain the +generated id sequences and the corresponding scores. The shapes and lods of the +two LodTensor are same. The lod level is 2 and the two levels separately +indicate how many hypotheses each source sentence has and how many ids each +hypothesis has. 
)DOC"); } }; @@ -172,10 +202,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { for (auto& o : op_desc.Output("SentenceIds")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_ids = block->FindRecursiveOrCreateVar(o); + sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto& o : op_desc.Output("SentenceScores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_scores = block->FindRecursiveOrCreateVar(o); + sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 3c01f81c83555b985bb6b7a9e3330ab594a62863..6aefc5446f167eebb0da673b3fbdf7ed128daa98 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include + #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,42 +27,12 @@ using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; // all the lod have 2 levels. -// The First is source level, the second is sentence level. -// source level describe how many candidate words for this source. -// sentence level describe these candidates belong to which prefix +// The first is source level, the second is sentence level. +// source level describe how many prefixes (branchs) for each source sentece +// (beam). sentence level describe how these candidates belong to the prefixes. const size_t kSourceLevel = 0; const size_t kSentenceLevel = 1; -template -struct BeamNode { - BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {} - - ~BeamNode() { - if (parent_) { - parent_->DropKid(this); - if (parent_->kids_.size() == 0UL) { - delete parent_; - } - } - VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_; - } - - void AppendTo(BeamNode* parent) { - parent_ = parent; - parent->kids_.insert(this); - } - - void DropKid(BeamNode* kid) { kids_.erase(kid); } - - BeamNode* parent_ = nullptr; - std::unordered_set kids_; - int64_t word_id_; - T score_; -}; - -template -using BeamNodeVector = std::vector>>; - template struct Sentence { std::vector word_ids; @@ -72,24 +44,8 @@ using SentenceVector = std::vector>; template struct BeamSearchDecoder { - /** - * make a BeamNode and all it's related prefix BeanNode into a Sentence. - */ - Sentence MakeSentence(const BeamNode* node) const; - - /** - * Param: - * cur_ids: LoDTensor of One step for word ID - * cur_scores: LoDTensor of One Step for word score - * prefixes_list: prefixes for each source sentence. - * sentence_vector_list: result sentence_vector for each source sentence. - * Return: - * a new prefixes list for each source of current step - */ - std::vector> PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const; + BeamSearchDecoder(size_t beam_size, int end_id) + : beam_size_(beam_size), end_id_(end_id) {} /** * convert the result sentence_vector for each source sentence into two @@ -100,107 +56,30 @@ struct BeamSearchDecoder { * sentence_vector_list: sentence_vector for each source sentence. * id_tensor: result LoDTensor for sentences of id. 
* score_tensor: result LoDTensor for sentences of score. + * reverse: whether ids of sentence in sentence_vector_list is reversed + * sort_by_score: whether to sort hypotheses of each sentence by scores. */ void ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; + LoDTensor* score_tensor, bool reverse = true, + bool sort_by_score = true) const; /** - * Pack all steps of id/score LodTensor into sentence LoDTensor - * it's main logic is: - * ```python - * prefix - * result_sentence - * result_lod_tensor - * - * for (step in steps): - * prefix = PackTwoSteps(prefix, step, &result_sentence) - * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor) - * ``` + * Gather the hypotheses for each source sentence by backtrace though the + * LoDTensorArray step_ids whose lods reserve the path in the tree. */ - void PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; -}; - -template -Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const { - Sentence sentence; - while (node != nullptr) { - sentence.word_ids.emplace_back(node->word_id_); - sentence.scores.emplace_back(node->score_); - node = node->parent_; - } - - std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids)); - std::reverse(std::begin(sentence.scores), std::end(sentence.scores)); - - return sentence; -} - -template -std::vector> BeamSearchDecoder::PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const { - std::vector> result; + void Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; - for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; - ++src_idx) { - size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - - BeamNodeVector beam_nodes; - - // if prefixes size is 0, it means this is the first step. In this step, - // all candidate id is the start of candidate sentences. 
- if (prefixes_list->empty()) { - PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), - cur_ids.lod().at(kSentenceLevel).back(), - "in the first step"); - for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) { - beam_nodes.push_back(std::unique_ptr>(new BeamNode( - cur_ids.data()[id_idx], cur_scores.data()[id_idx]))); - } - } else { - BeamNodeVector& prefixes = prefixes_list->at(src_idx); - SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx]; - - PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), - "prefix and candidate set number should be the same"); - - auto candidate_offset = cur_ids.lod()[kSentenceLevel]; - for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) { - std::unique_ptr>& prefix = prefixes[prefix_idx]; - size_t candidate_start = candidate_offset[src_start + prefix_idx]; - size_t candidate_end = candidate_offset[src_start + prefix_idx + 1]; - if (candidate_start == candidate_end) { - VLOG(3) << "this sentence has no more candidate, " - "add to result sentence and rm it from beam tree"; - sentence_vector.push_back(MakeSentence(prefix.get())); - prefix.reset(); - } else { - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; ++candidate_idx) { - auto* candidate = - new BeamNode(cur_ids.data()[candidate_idx], - cur_scores.data()[candidate_idx]); - candidate->AppendTo(prefix.get()); - beam_nodes.push_back(std::unique_ptr>(candidate)); - } - prefix.release(); - } - } - } - result.push_back(std::move(beam_nodes)); - } - return result; -} + size_t beam_size_; + int end_id_; +}; template void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const { + LoDTensor* score_tensor, bool reverse, bool sort_by_score) const { size_t src_num = sentence_vector_list.size(); PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); @@ -211,11 +90,29 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector score_data; for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + if (sort_by_score) { + sort(sentence_vector_list[src_idx].begin(), + sentence_vector_list[src_idx].end(), + [reverse](const Sentence& a, const Sentence& b) { + if (reverse) + return a.scores.front() > b.scores.front(); + else + return a.scores.back() > b.scores.back(); + }); + } for (Sentence& sentence : sentence_vector_list[src_idx]) { - id_data.insert(id_data.end(), sentence.word_ids.begin(), - sentence.word_ids.end()); - score_data.insert(score_data.end(), sentence.scores.begin(), - sentence.scores.end()); + if (reverse) { + id_data.insert(id_data.end(), sentence.word_ids.rbegin(), + sentence.word_ids.rend()); + score_data.insert(score_data.end(), sentence.scores.rbegin(), + sentence.scores.rend()); + } else { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + sentence.scores.end()); + } + sentence_level_lod.push_back(sentence_level_lod.back() + sentence.word_ids.size()); } @@ -243,39 +140,75 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( } template -void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { +void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger 
than 0"); PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), "step_ids and step_scores should be the same"); const size_t step_num = step_ids.size(); const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; + std::vector> sentence_vector_list( + src_num, SentenceVector(beam_size_)); + std::vector> prefix_idx_vector_list(src_num); + for (int step_id = step_num - 1; step_id >= 0; --step_id) { + auto& cur_ids = step_ids.at(step_id); + auto& cur_scores = step_scores.at(step_id); + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + // for each source sentence + auto& sentence_vector = sentence_vector_list.at(src_idx); + auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); + size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + if (prefix_idx_vector.empty()) { // be finished and pruned at this step + // or the last time step + for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; + ++prefix_idx) { + size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + size_t candidate_end = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + prefix_idx_vector.push_back(prefix_idx); + size_t idx = prefix_idx_vector.size() - 1; + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + } + } else { // use prefix_idx_vector to backtrace + size_t src_candidate_start = + cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; + size_t prefix_idx = src_prefix_start; + size_t candidate_num = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { + auto candidate_idx = prefix_idx_vector.at(idx); + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { + // to skip redundant end tokens + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } - PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); - - // previous prefixes for each step, - // the init length is 0, means this is the first step. 
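The Backtrace implementation above replaces the forward PackTwoSteps/PackAllSteps construction: it walks the selected ids from the last step back to the first, using the LoD offsets to find each hypothesis's prefix. A heavily simplified sketch of the same idea, where each step simply records (parent index in the previous step, word id) instead of LoD offsets:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using Step = std::vector<std::pair<int, int64_t>>;  // (parent index, word id)

std::vector<int64_t> BacktraceOne(const std::vector<Step> &steps, int last_idx) {
  std::vector<int64_t> words;
  int idx = last_idx;
  for (int t = static_cast<int>(steps.size()) - 1; t >= 0; --t) {
    words.push_back(steps[t][idx].second);
    idx = steps[t][idx].first;  // move to the prefix that produced this word
  }
  std::reverse(words.begin(), words.end());  // collected back-to-front
  return words;
}

int main() {
  // two steps, beam of width 2
  std::vector<Step> steps = {{{0, 2}, {0, 3}}, {{1, 5}, {0, 4}}};
  for (int64_t w : BacktraceOne(steps, 0)) std::cout << w << ' ';  // prints: 3 5
}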
- std::vector> beamnode_vector_list(0); - std::vector> sentence_vector_list(src_num); - - // pack all steps for one batch first, then another batch - for (size_t step_id = 0; step_id < step_num; ++step_id) { - beamnode_vector_list = - PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), - &beamnode_vector_list, &sentence_vector_list); - } - // append last beam_node to result - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - for (auto& beam_node : beamnode_vector_list.at(src_idx)) { - sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get())); - beam_node.reset(); + while (src_candidate_start + candidate_num <= + candidate_idx) { // search the corresponding prefix + prefix_idx++; + candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + } + prefix_idx_vector.at(idx) = prefix_idx; + } + } } } ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, - score_tensor); + score_tensor, true, true); } } // namespace operators diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc index 36f9594969c416c694928811012baf94332bbd91..88339e38d89db3f79abf232d6b0d035b759739a6 100644 --- a/paddle/fluid/operators/beam_search_decode_op_test.cc +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD; using LoDTensor = paddle::framework::LoDTensor; using LoDTensorArray = paddle::framework::LoDTensorArray; -template -using BeamNode = paddle::operators::BeamNode; template using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; template using Sentence = paddle::operators::Sentence; template -using BeamNodeVector = paddle::operators::BeamNodeVector; -template using SentenceVector = paddle::operators::SentenceVector; namespace paddle { @@ -77,138 +73,50 @@ void GenerateExample(const std::vector& level_0, } // namespace test } // namespace paddle -TEST(BeamSearchDecodeOp, DeleteBeamNode) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* b2 = new BeamNode(2, 2); - auto* b3 = new BeamNode(3, 3); - - b1->AppendTo(root); - b2->AppendTo(root); - b3->AppendTo(b1); - - delete b3; - delete b2; -} - -TEST(BeamSearchDecodeOp, MakeSentence) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* end = new BeamNode(2, 2); - b1->AppendTo(root); - end->AppendTo(b1); - - BeamSearchDecoder helper; - Sentence sentence = helper.MakeSentence(end); - delete end; - - std::vector expect_ids = {0, 1, 2}; - ASSERT_EQ(sentence.word_ids, expect_ids); - - std::vector expect_scores = {0, 1, 2}; - ASSERT_EQ(sentence.scores, expect_scores); -} - -TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) { - CPUPlace place; - - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample( - std::vector{0, 2, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - BeamSearchDecoder helper; - beamnode_vector_list = helper.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - ASSERT_EQ(beamnode_vector_list.size(), 2UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 2UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 4UL); -} - -TEST(BeamSearchDecodeOp, PackTwoSteps) { - CPUPlace place; - - // first source has three prefix - BeamNodeVector source0_prefixes; - source0_prefixes.push_back( - std::unique_ptr>(new 
BeamNode(1, 1))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(0, 0))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(3, 3))); - - // second source has two prefix - BeamNodeVector source1_prefixes; - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(4, 4))); - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(5, 5))); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - beamnode_vector_list.push_back(std::move(source0_prefixes)); - beamnode_vector_list.push_back(std::move(source1_prefixes)); - - // generate data for one step - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample(std::vector{0, 3, 5}, - std::vector{0, 1, 1, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); - - BeamSearchDecoder helper1; - beamnode_vector_list = helper1.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - - ASSERT_EQ(sentence_vector_list[0].size(), 1UL); - ASSERT_EQ(sentence_vector_list[1].size(), 0UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 3UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 2UL); -} - -TEST(BeamSearchDecodeOp, PackAllSteps) { +TEST(BeamSearchDecodeOp, Backtrace) { CPUPlace place; - // we will constuct a sample data with 3 steps and 2 source sentences + // Construct sample data with 5 steps and 2 source sentences + // beam_size = 2, start_id = 0, end_id = 1 LoDTensorArray ids; LoDTensorArray scores; paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + std::vector{0, 1, 2}, std::vector{0, 1, 2}, + std::vector{0, 0}, &ids, &scores); // start with start_id + paddle::test::GenerateExample(std::vector{0, 1, 2}, + std::vector{0, 2, 4}, + std::vector{2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 2, 2, 4, 4}, + std::vector{3, 1, 5, 4}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 1, 2, 3, 4}, + std::vector{1, 1, 3, 5}, &ids, &scores); paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 1, 3, 5, 5, 6}, - std::vector{0, 1, 2, 3, 4, 5}, &ids, &scores); - paddle::test::GenerateExample(std::vector{0, 3, 6}, - std::vector{0, 0, 1, 2, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); + std::vector{0, 2, 4}, + std::vector{0, 0, 0, 2, + 2}, // the branchs of the first source sentence + // are pruned since finished + std::vector{5, 1}, + &ids, &scores); - ASSERT_EQ(ids.size(), 3UL); - ASSERT_EQ(scores.size(), 3UL); + ASSERT_EQ(ids.size(), 5UL); + ASSERT_EQ(scores.size(), 5UL); - BeamSearchDecoder helper; + BeamSearchDecoder helper(2, 1); // beam_size = 2, end_id = 1 LoDTensor id_tensor; LoDTensor score_tensor; - helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); + helper.Backtrace(ids, scores, &id_tensor, &score_tensor); LoD lod = id_tensor.lod(); - std::vector expect_source_lod = {0, 4, 8}; + std::vector expect_source_lod = {0, 2, 4}; EXPECT_EQ(lod[0], expect_source_lod); - std::vector expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; + std::vector expect_sentence_lod = {0, 4, 7, 12, 17}; EXPECT_EQ(lod[1], expect_sentence_lod); - // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 - std::vector expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, - 4, 3, 2, 4, 4, 3, 6, 5, 4}; + std::vector expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4, + 5, 3, 5, 0, 4, 5, 3, 1}; ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); 
for (size_t i = 0; i < expect_data.size(); ++i) { ASSERT_EQ(id_tensor.data()[i], diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index df0b50881f4e3ec6f57bdb2b63033931059c486e..62771d09f112785ca1ba741a0ba239b1f0234633 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,25 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_op.h" - #include #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { void BeamSearch::operator()(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores) { auto abs_lod = framework::ToAbsOffset(ids_->lod()); auto &high_level = abs_lod[lod_level_]; - auto items = SelectTopBeamSizeItems(); + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); auto selected_items = ToMap(items, high_level.back()); VLOG(3) << "selected_items:"; for (size_t i = 0; i < selected_items.size(); ++i) { @@ -39,7 +40,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, VLOG(3) << ItemToString(item); } } - PruneEndidCandidates(pre_ids, &selected_items); + + PruneEndBeams(pre_ids, &selected_items); // calculate the output tensor's height size_t num_instances = std::accumulate( std::begin(selected_items), std::end(selected_items), 0, @@ -61,12 +63,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, size_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); - sort(items.begin(), items.end(), [](const Item &a, const Item &b) { - if (a.offset < b.offset) { - return true; - } - return a.id < b.id; - }); for (auto &item : items) { ids_data[low_offset] = item.id; scores_data[low_offset] = item.score; @@ -86,21 +82,31 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, selected_scores->set_lod(lod); } -int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, - std::vector> *items) { +void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, + std::vector> *items) { auto *pre_ids_data = pre_ids.data(); - - int res = 0; - for (size_t offset = 0; offset < items->size(); offset++) { - auto prefix_id = pre_ids_data[offset]; - if (prefix_id == end_id_) { - items->at(offset).clear(); - } else { - res++; + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { + size_t src_prefix_start = high_level[src_idx]; + size_t src_prefix_end = high_level[src_idx + 1]; + bool finish_flag = true; + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { + for (auto &item : items->at(offset)) { + if (item.id != static_cast(end_id_) || + pre_ids_data[offset] != end_id_) { + finish_flag = false; + break; + } + } + if (!finish_flag) break; + } + if (finish_flag) { // all branchs of the beam (source sentence) end and + // prune this beam + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) + items->at(offset).clear(); } } - - return res; } std::vector> BeamSearch::ToMap( @@ -115,19 +121,17 @@ std::vector> BeamSearch::ToMap( return 
result; } -std::vector> -BeamSearch::SelectTopBeamSizeItems() { +std::vector> BeamSearch::SelectTopBeamSizeItems( + const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores) { std::vector> result; std::vector items; // for each source sentence, select the top beam_size items across all // candidate sets. - while (NextItemSet(&items)) { - std::nth_element(std::begin(items), std::begin(items) + beam_size_, - std::end(items), [](const Item &a, const Item &b) { - // TODO(superjom) make score's comparation customizable. - // partial sort in descending order - return a.score > b.score; - }); + while (NextItemSet(pre_ids, pre_scores, &items)) { + std::nth_element( + std::begin(items), std::begin(items) + beam_size_, std::end(items), + [](const Item &a, const Item &b) { return a.score > b.score; }); // prune the top beam_size items. if (items.size() > beam_size_) { items.resize(beam_size_); @@ -146,7 +150,9 @@ BeamSearch::SelectTopBeamSizeItems() { } // the candidates of a source -bool BeamSearch::NextItemSet(std::vector *items) { +bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, + std::vector *items) { if (sent_offset_ >= ids_->NumElements(lod_level_)) { return false; } @@ -164,14 +170,24 @@ bool BeamSearch::NextItemSet(std::vector *items) { instance_dim *= ids.dims()[i]; } + auto *pre_ids_data = pre_ids.data(); + auto *pre_scores_data = pre_scores.data(); items->clear(); items->reserve(framework::product(ids.dims())); for (size_t offset = abs_lod[lod_level_][sent_offset_]; offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); + auto pre_id = pre_ids_data[offset]; + auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id_) { + // Allocate all probability mass to eos_id for finished branchs and the + // other candidate ids can be ignored. + items->emplace_back(offset, end_id_, pre_score); + } else { + for (size_t d = 0; d < instance_dim; d++) { + const size_t dim_offset = offset * instance_dim + d; + items->emplace_back(offset, ids_data[dim_offset], + scores_data[dim_offset]); + } } } @@ -199,15 +215,27 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { // inputs and outputs stored in proto - AddInput("pre_ids", "ids in previous step"); - AddInput("ids", "a LoDTensor of shape of [None,k]"); + AddInput("pre_ids", + "(LoDTensor) The LoDTensor containing the selected ids at the " + "previous step. It should be a tensor with shape (batch_size, 1) " + "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " + "thefirst step."); + AddInput("pre_scores", + "(LoDTensor) The LoDTensor containing the accumulated " + "scores corresponding to the selected ids at the previous step."); + AddInput("ids", + "(LoDTensor) The LoDTensor containing the candidates ids. 
Its " + "shape should be (batch_size * beam_size, K), where K supposed to " + "be beam_size."); AddInput("scores", - "a LoDTensor that has the same shape and LoD with `ids`"); + "(LoDTensor) The LodTensor containing the accumulated scores " + "corresponding to Input(ids) and its shape is the same as the " + "shape of Input(ids)."); AddOutput("selected_ids", - "a LoDTensor that stores the IDs selected by beam search"); - AddOutput( - "selected_scores", - "a LoDTensor that has the same shape and LoD with `selected_ids`"); + "A LodTensor that stores the IDs selected by beam search."); + AddOutput("selected_scores", + "A LoDTensor containing the accumulated scores corresponding to " + "Output(selected_ids)."); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); @@ -215,8 +243,21 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("end_id", "the token id which indicates the end of a sequence"); - AddComment( - "This is a beam search operator that help to generate sequences."); + AddComment(R"DOC( +This operator does the search in beams for one time step. +Specifically, it selects the top-K candidate word ids of current step from +Input(ids) according to their Input(scores) for all source sentences, +where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results +from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores) +are the output of beam_search at previous step, they are needed for special use +to handle ended candidate translations. The paths linking prefixes and selected +candidates are organized and reserved in lod. + +Note that the Input(scores) passed in should be accumulated scores, and +length penalty should be done with extra operators before calculating the +accumulated scores if needed, also suggest finding top-K before it and +using the top-K candidates following. +)DOC"); } }; @@ -253,10 +294,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("selected_ids")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_ids = block->FindRecursiveOrCreateVar(o); + selected_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto &o : op_desc.Output("selected_scores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_scores = block->FindRecursiveOrCreateVar(o); + selected_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 46bc4f6f936929050276e8b3b93f1eddd62ac638..b5e2ed05924cc8b7bc06058b9b1103ba10be486e 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -132,6 +132,7 @@ class BeamSearch { * that means no candidates is provided, and the task will stop running. */ void operator()(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores); /* @@ -153,14 +154,16 @@ class BeamSearch { protected: /* - * Delete all the records that follows the end token. + * Prune the source sentences all branchs finished, and it is optional. + * Pruning must one step later than finishing (thus pre_ids is needed here), + * since the end tokens must be writed out. 
*/ - int PruneEndidCandidates(const framework::LoDTensor& pre_ids, - std::vector>* items); + void PruneEndBeams(const framework::LoDTensor& pre_ids, + std::vector>* items); /* * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance + * NOTE low performance. */ std::vector> ToMap( const std::vector>& inputs, size_t element_num); @@ -168,12 +171,16 @@ class BeamSearch { /* * For each source, select top beam_size records. */ - std::vector> SelectTopBeamSizeItems(); + std::vector> SelectTopBeamSizeItems( + const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores); /* * Get the items of next source sequence, return false if no remaining items. */ - bool NextItemSet(std::vector* items); + bool NextItemSet(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, + std::vector* items); private: size_t beam_size_; @@ -192,24 +199,25 @@ template class BeamSearchOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* ids_var = context.Input("ids"); - auto* scores_var = context.Input("scores"); - auto* pre_ids_var = context.Input("pre_ids"); - PADDLE_ENFORCE_NOT_NULL(ids_var); - PADDLE_ENFORCE_NOT_NULL(scores_var); - PADDLE_ENFORCE_NOT_NULL(pre_ids_var); + auto* ids = context.Input("ids"); + auto* scores = context.Input("scores"); + auto* pre_ids = context.Input("pre_ids"); + auto* pre_scores = context.Input("pre_scores"); + PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); + PADDLE_ENFORCE_NOT_NULL(pre_ids); + PADDLE_ENFORCE_NOT_NULL(pre_scores); size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id); - auto selected_ids_var = - context.Output("selected_ids"); - auto selected_scores_var = + BeamSearch alg(*ids, *scores, level, beam_size, end_id); + auto selected_ids = context.Output("selected_ids"); + auto selected_scores = context.Output("selected_scores"); - PADDLE_ENFORCE_NOT_NULL(selected_ids_var); - PADDLE_ENFORCE_NOT_NULL(selected_scores_var); - alg(*pre_ids_var, selected_ids_var, selected_scores_var); + PADDLE_ENFORCE_NOT_NULL(selected_ids); + PADDLE_ENFORCE_NOT_NULL(selected_scores); + alg(*pre_ids, *pre_scores, selected_ids, selected_scores); } }; } // namespace operators diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index ec666359aa2bd81f1323b54f9a03235740c3a696..c4f4b478fbfc87e4178155132781214575c1e6b0 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,7 +30,7 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0({0, 1, 4}); + vector level0({0, 2, 4}); vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); @@ -64,17 +64,22 @@ TEST(beam_search_op, run) { for (int i = 0; i < 4; i++) { pre_ids.mutable_data(place)[i] = i + 1; } + LoDTensor pre_scores; + pre_scores.Resize(framework::make_ddim(vector(4, 1))); + for (int i = 0; i < 4; i++) { + pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); + } - BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); + BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); LoDTensor sids, sscores; - beamsearch(pre_ids, &sids, &sscores); + beamsearch(pre_ids, pre_scores, &sids, &sscores); LOG(INFO) << "score: " << sscores << endl; ASSERT_EQ(sids.lod(), 
sscores.lod()); - vector tids({2, 4, 3, 8}); - vector tscores({0.3, 0.5, 0.9, 0.7}); + vector tids({4, 2, 3, 8}); + vector tscores({0.5, 0.6, 0.9, 0.7}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4219a429a53eb4869426a2674109555fb784b85 --- /dev/null +++ b/paddle/fluid/operators/checkpoint_notify_op.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/send_recv_util.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +class CheckpointNotifyOp : public framework::OperatorBase { + public: + CheckpointNotifyOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + std::vector epmap = Attr>("epmap"); + std::string dir = Attr("dir"); + std::string lookup_table_name = Attr("lookup_table"); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); + for (size_t i = 0; i < epmap.size(); i++) { + auto lookup_table_save_dir = + string::Sprintf("%s/%s_%d", dir, lookup_table_name, i); + rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir); + VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name + << " and dir:" << dir << " to " << epmap[i]; + } + rpc_client->Wait(); + } +}; + +class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Parameter Server endpoints in the order") + .SetDefault({"127.0.0.1:6164"}); + AddAttr( + "dir", "(string, default '') indicate the folder checkpoint will use"); + AddAttr("lookup_table", + "(string, default '') the lookup table name"); + AddComment(R"DOC( +CheckpointNotify operator + +This operator will send lookup table and it's checkpoint direcoty to listen_and_serve op at +the parameter server. 
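The CheckpointNotify operator described in the DOC block above sends one RPC per parameter server, and the save directory it passes along is simply `<dir>/<lookup_table>_<pserver index>`, built with string::Sprintf in RunImpl. A small sketch of that fan-out using only the standard library; the endpoint list, directory, and table name are made-up example values:

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      // Example values; in the operator they come from the op's attributes.
      std::vector<std::string> epmap = {"127.0.0.1:6164", "127.0.0.1:6165"};
      std::string dir = "/tmp/checkpoint";
      std::string lookup_table = "embedding_w";

      for (std::size_t i = 0; i < epmap.size(); ++i) {
        // Same shape as string::Sprintf("%s/%s_%d", dir, lookup_table, i).
        std::string save_dir = dir + "/" + lookup_table + "_" + std::to_string(i);
        std::cout << "notify " << epmap[i] << " to save into " << save_dir << "\n";
      }
      return 0;
    }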
+)DOC"); + } +}; + +class CheckpointNotifyOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(checkpoint_notify, ops::CheckpointNotifyOp, + paddle::framework::EmptyGradOpMaker, + ops::CheckpointNotifyOpMaker, + ops::CheckpointNotifyOpShapeInference); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 2e9e957ebdc2a5cb7663b968c5da631aebe60b1c..eeb98ee44f206dbfbe1f61689aa9843122ae3f92 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -302,6 +302,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( namespace ops = paddle::operators; +// conv2d_transpose REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, paddle::framework::DefaultGradOpDescMaker); @@ -317,6 +318,7 @@ REGISTER_OP_CPU_KERNEL( ops::GemmConvTransposeGradKernel); +// conv3d_transpose REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, paddle::framework::DefaultGradOpDescMaker); @@ -331,3 +333,19 @@ REGISTER_OP_CPU_KERNEL( ops::GemmConvTransposeGradKernel, ops::GemmConvTransposeGradKernel); + +// depthwise conv2d_transpose +REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, + ops::Conv2DTransposeOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc index 640fa7d14a079debeceb54d8775c4ede7da1b536..a6d5665df83ae5c89d42840e91a6abd853fedd12 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_op.cu.cc @@ -15,25 +15,28 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL( - conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); +// conv2d +REGISTER_OP_CUDA_KERNEL(conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// conv3d +REGISTER_OP_CUDA_KERNEL(conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// depthwise conv2d +REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose, + ops::DepthwiseConvTransposeKernel, + ops::DepthwiseConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad, + ops::DepthwiseConvTransposeGradKernel, + ops::DepthwiseConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 1dcfc651fdd79aed50736d05d38ec8576b183d41..0d9c6a62fec1ea24bee5c24b4a7b792781f14d9e 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" @@ -316,5 +317,74 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } } }; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ(groups, filter.dims()[0]); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1); + } + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, + output); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + 
context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, + input_grad); + } + + if (filter_grad) { + math::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, + filter_grad); + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index d437ad5c19828331c749244404ba80d0f3acda2a..c23b65fe4dead3ca01a447d03877e3359b19e656 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -51,6 +51,12 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { } }; +template +bool DistPairDescend(std::tuple pair1, + std::tuple pair2) { + return std::get<2>(pair1) > std::get<2>(pair2); +} + template class BipartiteMatchKernel : public framework::OpKernel { public: @@ -58,46 +64,76 @@ class BipartiteMatchKernel : public framework::OpKernel { // The match_dist must be initialized to 0 at first. void BipartiteMatch(const Tensor& dist, int* match_indices, T* match_dist) const { - constexpr T kEPS = static_cast(1e-6); PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); int64_t row = dist.dims()[0]; int64_t col = dist.dims()[1]; auto* dist_data = dist.data(); - std::vector row_pool; - for (int i = 0; i < row; ++i) { - row_pool.push_back(i); - } - while (row_pool.size() > 0) { - int max_idx = -1; - int max_row_idx = -1; - T max_dist = -1; - for (int64_t j = 0; j < col; ++j) { - if (match_indices[j] != -1) { - continue; + // Test result: When row==130 the speed of these two methods almost the same + if (row >= 130) { + std::vector> match_pair; + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j])); } - for (size_t k = 0; k < row_pool.size(); ++k) { - int m = row_pool[k]; - // distance is 0 between m-th row and j-th column - if (dist_data[m * col + j] < kEPS) { + } + std::sort(match_pair.begin(), match_pair.end(), DistPairDescend); + std::vector row_indices(row, -1); + + int64_t idx = 0; + for (int64_t k = 0; k < row * col; ++k) { + int64_t i = std::get<0>(match_pair[k]); + int64_t j = std::get<1>(match_pair[k]); + T dist = std::get<2>(match_pair[k]); + + if (idx >= row) { + break; + } + if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) { + match_indices[j] = i; + row_indices[i] = j; + match_dist[j] = dist; + idx += 1; + } + } + } else { + constexpr T kEPS = static_cast(1e-6); + std::vector row_pool; + for (int i = 0; i < row; ++i) { + row_pool.push_back(i); + } + while (row_pool.size() > 0) { + int max_idx = -1; + int max_row_idx = -1; + T max_dist = -1; + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { continue; } - if (dist_data[m * col + j] > max_dist) { - max_idx = j; - max_row_idx = m; - max_dist = dist_data[m * col + j]; + for (size_t k = 0; k < row_pool.size(); ++k) { + int m = 
row_pool[k]; + // distance is 0 between m-th row and j-th column + if (dist_data[m * col + j] < kEPS) { + continue; + } + if (dist_data[m * col + j] > max_dist) { + max_idx = j; + max_row_idx = m; + max_dist = dist_data[m * col + j]; + } } } - } - if (max_idx == -1) { - // Cannot find good match. - break; - } else { - PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); - match_indices[max_idx] = max_row_idx; - match_dist[max_idx] = max_dist; - // Erase the row index. - row_pool.erase( - std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); + match_indices[max_idx] = max_row_idx; + match_dist[max_idx] = max_dist; + // Erase the row index. + row_pool.erase( + std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + } } } } diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h index 34f140687f91d866536f5e2b647c7445a6624736..8ff1f0a6076b3574c42065edcbac50eb75b3b483 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -55,26 +55,24 @@ class BRPCClient : public RPCClient { bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, - int64_t time_out = RPCClient::rpc_time_out) override; + int64_t time_out = FLAGS_rpc_deadline) override; bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, - int64_t time_out = RPCClient::rpc_time_out) override; + int64_t time_out = FLAGS_rpc_deadline) override; bool AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& in_var_name, const std::string& out_var_name, - int64_t time_out = RPCClient::rpc_time_out) override; + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBatchBarrier( - const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out) override; + void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendFetchBarrier( - const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out) override; + void AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; void Wait() override; diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 52f931188dc790682626b14da83d0835cad4f1a6..8228a8c5a3eae73fe82551c8aad55290b0d54ef0 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -239,6 +239,23 @@ void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { req_count_++; } +void GRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + const auto ch = GetChannel(ep); + + CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} + void GRPCClient::Wait() { std::unique_lock lk(sync_mutex_); sync_cond_.wait(lk, [this] { return req_count_ == 0; }); @@ -269,14 +286,15 @@ void 
GRPCClient::Proceed() { } std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { - // TODO(Yancey1989): make grpc client completely thread-safe std::lock_guard guard(chan_mutex_); auto it = channels_.find(ep); if (it != channels_.end()) { return it->second; } + // Channel configurations: grpc::ChannelArguments args; + args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 7875939ff510e7e41a2a11ca965b52eedff3d05c..7a08f2d3a4a28a4323723e6b887c50588eed2bce 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -76,6 +76,7 @@ class BaseProcessor { virtual void Prepare(const VarHandle& var_info, int64_t time_out) { context_.reset(new grpc::ClientContext()); var_h_ = var_info; + context_->set_wait_for_ready(true); std::chrono::system_clock::time_point deadline = std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); @@ -85,6 +86,7 @@ class BaseProcessor { virtual void Prepare(int64_t time_out) { context_.reset(new grpc::ClientContext()); + context_->set_wait_for_ready(true); std::chrono::system_clock::time_point deadline = std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); @@ -169,6 +171,20 @@ class FetchBarrierProcessor : public BaseProcessor { std::unique_ptr stub_; }; +class CheckpointNotifyProcessor : public BaseProcessor { + public: + explicit CheckpointNotifyProcessor(std::shared_ptr ch) + : BaseProcessor(ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } + + virtual ~CheckpointNotifyProcessor() {} + + virtual void Process() {} + sendrecv::VoidMessage reply_; + std::unique_ptr stub_; +}; + class GRPCClient : public RPCClient { public: GRPCClient() {} @@ -176,26 +192,27 @@ class GRPCClient : public RPCClient { bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, - int64_t time_out = RPCClient::rpc_time_out) override; + int64_t time_out = FLAGS_rpc_deadline) override; bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, - int64_t time_out = RPCClient::rpc_time_out) override; + int64_t time_out = FLAGS_rpc_deadline) override; bool AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& in_var_name, const std::string& out_var_name, - int64_t time_out = RPCClient::rpc_time_out) override; + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBatchBarrier( - const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out) override; + void AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendFetchBarrier( - const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out) override; + void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; void Wait() override; @@ -211,7 +228,7 @@ class GRPCClient : public RPCClient { void Proceed(); void AsyncSendComplete(const std::string& ep, - int64_t time_out = 
RPCClient::rpc_time_out); + int64_t time_out = FLAGS_rpc_deadline); std::shared_ptr GetChannel(const std::string& ep); diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index b9a9b12cecdada570da5af173e394999554e9cb8..f35e268f6ad36da02f17db2feb3fbf1fdf6c1e41 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -97,7 +97,7 @@ class RequestSend final : public RequestBase { void Process() override { std::string varname = GetReqName(); - VLOG(3) << "RequestSend var_name:" << varname; + VLOG(4) << "RequestSend var_name:" << varname; auto scope = request_->GetMutableLocalScope(); auto invar = request_->GetVar(); @@ -132,7 +132,7 @@ class RequestGet final : public RequestBase { void Process() override { // proc request. std::string varname = request_.varname(); - VLOG(3) << "RequestGet " << varname; + VLOG(4) << "RequestGet " << varname; auto scope = request_handler_->scope(); auto invar = scope->FindVar(varname); @@ -178,7 +178,7 @@ class RequestPrefetch final : public RequestBase { // prefetch process... std::string in_var_name = request_->Varname(); std::string out_var_name = request_->OutVarname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name + VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name << " out_var_name: " << out_var_name; auto scope = request_->GetMutableLocalScope(); @@ -200,11 +200,50 @@ class RequestPrefetch final : public RequestBase { framework::Scope* local_scope_; }; +class RequestCheckpointNotify final : public RequestBase { + public: + explicit RequestCheckpointNotify(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + request_.reset(new VariableResponse(request_handler->scope(), + request_handler->dev_ctx())); + int method_id = + static_cast(distributed::GrpcMethod::kCheckpointNotify); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestCheckpointNotify() {} + + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + auto scope = request_->GetMutableLocalScope(); + + std::string checkpoint_notify = request_->Varname(); + std::string checkpoint_dir = request_->OutVarname(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir; + + request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, + checkpoint_dir); + Finish(reply_, &responder_); + } + + protected: + std::shared_ptr request_; + sendrecv::VoidMessage reply_; + ServerAsyncResponseWriter responder_; +}; + void AsyncGRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; + VLOG(4) << "AsyncGRPCServer is wait server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; + VLOG(4) << "AsyncGRPCServer WaitSeverReady"; } void AsyncGRPCServer::StartServer() { @@ -237,13 +276,14 @@ void AsyncGRPCServer::StartServer() { reqs.reserve(kRequestBufSize); for (int i = 0; i < kRequestBufSize; i++) { + VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i; TryToRegisterNewOne(rpc_name, i); } for (int i = 0; i < threadnum; i++) { rpc_threads_[rpc_name].emplace_back(new 
std::thread(std::bind( &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(3) << t.first << " creates threads!"; + VLOG(4) << t.first << " creates threads!"; } } @@ -260,7 +300,7 @@ void AsyncGRPCServer::StartServer() { auto& threads = t.second; for (size_t i = 0; i < threads.size(); ++i) { threads[i]->join(); - VLOG(3) << t.first << " threads ends!"; + VLOG(4) << t.first << " threads ends!"; } } } @@ -268,7 +308,7 @@ void AsyncGRPCServer::StartServer() { void AsyncGRPCServer::ShutdownQueue() { for (auto& t : rpc_cq_) { t.second->Shutdown(); - VLOG(3) << t.first << " shutdown!"; + VLOG(4) << t.first << " queue shutdown!"; } } @@ -277,7 +317,7 @@ void AsyncGRPCServer::ShutDownImpl() { is_shut_down_ = true; ShutdownQueue(); - VLOG(3) << "server_ shutdown!"; + VLOG(4) << "server_ shutdown!"; server_->Shutdown(); } @@ -285,12 +325,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, int req_id) { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { - LOG(WARNING) << "shutdown, do not TryToRegisterNewSendOne"; + VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; return; } - VLOG(4) << "register send rpc_name:" << rpc_name - << ", handler:" << rpc_call_map_[kRequestSend]; + VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name + << " REQ ID: " << req_id; auto& reqs = rpc_reqs_[rpc_name]; auto& handler = rpc_call_map_[rpc_name]; @@ -303,6 +343,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestGet(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestPrefetch) { b = new RequestPrefetch(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestCheckpoint) { + b = new RequestCheckpointNotify(&service_, cq.get(), handler, req_id); } else { PADDLE_ENFORCE(false, "not supported rpc"); } @@ -321,7 +363,7 @@ void AsyncGRPCServer::HandleRequest( while (true) { VLOG(4) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!"; + VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; break; } diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h index 141be3e68012743a32e4df5de148a55717f8e9a2..cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0 100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -80,10 +80,11 @@ enum class GrpcMethod { kSendVariable, kGetVariable, kPrefetchVariable, + kCheckpointNotify, }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kPrefetchVariable) + 1; + static_cast(GrpcMethod::kCheckpointNotify) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { @@ -93,6 +94,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/GetVariable"; case GrpcMethod::kPrefetchVariable: return "/sendrecv.SendRecvService/PrefetchVariable"; + case GrpcMethod::kCheckpointNotify: + return "/sendrecv.SendRecvService/CheckpointNotify"; } // Shouldn't be reached. 
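One detail worth noting in the grpc_service.h hunk above: kGrpcNumMethods is computed from the last enumerator, so registering the new CheckpointNotify method only requires appending kCheckpointNotify and adding a case to the name switch. A minimal illustration of that pattern with simplified stand-ins for the real GrpcMethod enum (not the actual Paddle/gRPC types):

    #include <cassert>

    enum class Method { kSend, kGet, kPrefetch, kCheckpointNotify };

    // The count follows automatically from the last enumerator.
    constexpr int kNumMethods = static_cast<int>(Method::kCheckpointNotify) + 1;

    const char* MethodName(Method m) {
      switch (m) {
        case Method::kSend: return "SendVariable";
        case Method::kGet: return "GetVariable";
        case Method::kPrefetch: return "PrefetchVariable";
        case Method::kCheckpointNotify: return "CheckpointNotify";
      }
      return "";  // unreachable
    }

    int main() {
      assert(kNumMethods == 4);
      return 0;
    }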
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index cf106656aa56c2130d8be8dbe7478c3397f9b9ad..90742a201ad46447d6fbbe2137aa40fabc2f9983 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -36,12 +36,16 @@ namespace distributed { constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; +constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV" +#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" +#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" + class RPCServer; class RequestHandler { @@ -69,6 +73,11 @@ class RequestHandler { prefetch_var_name_to_prepared_ctx_ = g; } + void SetCheckpointNotifyPreparedCtx( + std::shared_ptr g) { + checkpoint_prepared_ctx_ = g; + } + // Used for async. void SetGradToPreparedCtx( std::unordered_map< @@ -115,6 +124,8 @@ class RequestHandler { std::unordered_map>* prefetch_var_name_to_prepared_ctx_; + // used for checkpoint notify + std::shared_ptr checkpoint_prepared_ctx_; // Used for async. std::unordered_mapFindVar(LOOKUP_TABLE_PATH)->GetMutable(); + lt_var->clear(); + lt_var->append(out_var_name); + VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " + << out_var_name; + executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope); + return true; +} + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index abbe8778911a21ece3090bc9790d51a3cb31b6d7..87185500f2ffc3a8578eea339cc7a1e2b0e46631 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -66,6 +66,21 @@ class RequestPrefetchHandler final : public RequestHandler { const std::string& out_var_name = "") override; }; +class RequestCheckpointHandler final : public RequestHandler { + public: + explicit RequestCheckpointHandler(bool sync_mode, int checkpoint_notify_id) + : RequestHandler(sync_mode) { + this->checkpoint_notify_id = checkpoint_notify_id; + } + virtual ~RequestCheckpointHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; + + private: + int checkpoint_notify_id; +}; + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc index c71edf977c18e554c502732e9bf4bb4ea99f8f99..b5ec9fe5367beb97b3cc7298102deff1e8ca4ec9 100644 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ b/paddle/fluid/operators/distributed/rpc_client.cc @@ -13,6 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/distributed/rpc_client.h" +#include "gflags/gflags.h" + +// default to 3min to avoid temprary network failures. 
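The gflag defined just below (rpc_deadline, declared in rpc_client.h) replaces the old hard-coded rpc_time_out = 120 * 1000 constant that the RPC clients used as their default timeout. Because the flag is read through a default argument, every call that omits the timeout picks up whatever value the flag holds at runtime. A tiny sketch of that mechanism with a plain global standing in for the gflag (not Paddle's actual client code):

    #include <cassert>
    #include <cstdint>

    // Stand-in for DEFINE_int32(rpc_deadline, 180000, ...): a mutable global.
    int32_t FLAGS_rpc_deadline = 180000;

    // Default arguments are evaluated at each call, so callers that omit the
    // timeout always see the flag's current value.
    int64_t EffectiveDeadline(int64_t time_out = FLAGS_rpc_deadline) {
      return time_out;
    }

    int main() {
      assert(EffectiveDeadline() == 180000);
      FLAGS_rpc_deadline = 30000;  // e.g. overridden from the command line
      assert(EffectiveDeadline() == 30000);
      return 0;
    }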
+DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 72fa6d940886bc676e9d03d13f12d07772f5f5a7..37783b78ecc5c58aab3e358066bd7f2fba861799 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -15,11 +15,14 @@ #pragma once #include +#include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +DECLARE_int32(rpc_deadline); + namespace paddle { namespace operators { namespace distributed { @@ -32,26 +35,30 @@ class RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, - int64_t time_out = rpc_time_out) = 0; + int64_t time_out = FLAGS_rpc_deadline) = 0; virtual bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, - int64_t time_out = rpc_time_out) = 0; + int64_t time_out = FLAGS_rpc_deadline) = 0; virtual bool AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& in_var_name, const std::string& out_var_name, - int64_t time_out = rpc_time_out) = 0; + int64_t time_out = FLAGS_rpc_deadline) = 0; virtual void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = rpc_time_out) = 0; + int64_t time_out = FLAGS_rpc_deadline) = 0; virtual void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = rpc_time_out) = 0; + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) = 0; // SendComplete tells all the server that current trainer have no more data // to train, so that the pserver can reduce it's barrier count, and continue @@ -60,8 +67,6 @@ class RPCClient { virtual void Wait() = 0; - static constexpr int64_t rpc_time_out = 120 * 1000; - template static RPCClient* GetInstance() { std::call_once(init_flag_, &RPCClient::Init); diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index fa0cb71b3056de92f65139c5402132fc8cbb7a87..c0520e248d49f4f390af9075fc6f87ec4bd74c39 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -47,11 +47,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) { return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load()); }); - VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name]; + VLOG(3) << "batch_barrier_: " << rpc_name << " " + << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; @@ -100,7 +101,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond " << rpc_name; + VLOG(4) << "RPCServer WaitCond " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto index 
54cb93e04d18b3784be187c9c8885bbccc55488b..e0902320cff003797b12ed0204f7f99c44554b62 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto +++ b/paddle/fluid/operators/distributed/send_recv.proto @@ -25,6 +25,8 @@ service SendRecvService { rpc GetVariable(VariableMessage) returns (VariableMessage) {} // pre-fetch variable by given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} + + rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f612256840825a75f49944ab97ff957d572a863 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise_op_function.h" + +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::reorder; +using mkldnn::primitive; +using mkldnn::stream; +using mkldnn::sum; + +template +class EltwiseAddMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + const T* x_data = x->data(); + const T* y_data = y->data(); + T* z_data = z->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + auto z_dims = z->dims(); + + // Execute default elementwise_add operator when + // broadcast operations need to performed. + if (x_dims != y_dims) { + auto sum_func = [](T a, T b) -> T { return a + b; }; + + TransformFunctor + functor( + x, y, z, + ctx.template device_context(), + sum_func); + + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + trim_trailing_singular_dims(&y_dims); + axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); + + if (post == 1) { + functor.RunRowWise(n, pre); + } else { + functor.RunMidWise(n, pre, post); + } + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); + } else { + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for X tensor"); + PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN && + y->format() != memory::format::format_undef, + "Wrong layout/format set for X tensor"); + + std::vector src_x_tz = framework::vectorize2int(x_dims); + std::vector src_y_tz = framework::vectorize2int(y_dims); + std::vector dst_tz = framework::vectorize2int(z_dims); + + std::vector srcs_pd; + std::vector srcs; + std::vector scales = {1.0f, 1.0f}; + + auto src_x_pd = memory::primitive_desc( + {{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine); + auto src_y_pd = memory::primitive_desc( + {{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine); + auto src_x_memory = + memory(src_x_pd, paddle::platform::to_void_cast(x_data)); + auto src_y_memory = + memory(src_y_pd, paddle::platform::to_void_cast(y_data)); + + srcs_pd.push_back(src_x_pd); + srcs_pd.push_back(src_y_pd); + srcs.push_back(src_x_memory); + srcs.push_back(src_y_memory); + + auto dst_md = + memory::desc({dst_tz}, memory::data_type::f32, memory::format::any); + + // create primitive descriptor for sum + auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd); + + // create mkldnn memory for dst + memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data); + + std::vector inputs; + inputs.push_back(srcs[0]); + inputs.push_back(srcs[1]); + + // create sum primitive + auto sum_prim = sum(sum_pd, inputs, dst_memory); + + std::vector pipeline; + pipeline.push_back(sum_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + z->set_layout(DataLayout::kMKLDNN); + z->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); + } + } +}; + +template +class EltwiseAddMKLDNNGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { + in->set_layout(DataLayout::kMKLDNN); + in->set_format(out->format()); + }; + + if (x->dims() == y->dims()) { + auto blas = math::GetBlas(ctx); + if (dx) { + blas.VCOPY(dout->numel(), dout->data(), + dx->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dx, dout); + } + + if (dy) { + blas.VCOPY(dout->numel(), dout->data(), + dy->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dy, dout); + } + } else { + // Execute default kernel when broadcast is needed + ElemwiseGradCompute, IdentityGrad>( + ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), + IdentityGrad()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseAddMKLDNNKernel) + +REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseAddMKLDNNGradKernel) diff --git 
a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index 12364fff96c03c5f9dff23c7c00ceedd043803a6..bb88970e42c194d9437609b62435f1a89e2b446b 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -14,8 +14,12 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ElementwiseOpInferVarType : public framework::VarTypeInference { @@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { "for broadcasting Y onto X.") .SetDefault(-1) .EqualGreaterThan(-1); + AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( Limited Elementwise %s Operator @@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index d98bf807a9464c1c2294aa0601386a940ddc00f8..56e39649b409f7eed108027f6df58c19dd3c8ab8 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -99,7 +99,8 @@ static int64_t GetTimestamp() { void ListenAndServOp::RunSyncLoop( framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope, - const std::vector &prefetch_block_id_list) const { + const std::vector &prefetch_block_id_list, + const int checkpoint_point_block_id) const { size_t num_blocks = program->Size(); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -163,8 +164,8 @@ void ListenAndServOp::RunSyncLoop( } void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, - framework::ProgramDesc *program) const { - VLOG(3) << "RunAsyncLoop in"; + framework::ProgramDesc *program, + framework::Scope *recv_scope) const { // grad name to block id std::unordered_map grad_to_block_id; std::unordered_map id_to_grad; @@ -191,6 +192,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, block_list.push_back(blkid); } auto optimize_prepared = executor->Prepare(*program, block_list); + // execute global 
block if needed + if (block_list[0] == 1 && id_to_grad.count(1) == 0) { + executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope); + } std::unordered_map> grad_to_prepared_ctx; @@ -202,10 +207,9 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); - VLOG(3) << "RunAsyncLoop into while"; while (true) { if (rpc_service_->IsExit()) { - LOG(INFO) << "get exit!rpc_processor break!"; + VLOG(4) << "get exit!rpc_processor break!"; break; } @@ -220,6 +224,7 @@ static void FillRequestCtx( std::unordered_map> *prefetch_ctx, + std::shared_ptr checkpoint_ctx, distributed::RPCServer *rpc_server) { h->SetScope(scope); h->SetDevCtx(dev_ctx); @@ -227,6 +232,7 @@ static void FillRequestCtx( h->SetProgram(program); h->SetPrefetchPreparedCtx(prefetch_ctx); h->SetRPCServer(rpc_server); + h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } void ListenAndServOp::RunImpl(const framework::Scope &scope, @@ -242,9 +248,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, PADDLE_ENFORCE(!rpc_service_); std::string endpoint = Attr("endpoint"); + int checkpoint_block_id = Attr(kCheckpointBlockId); - LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in - << ", end_point:" << endpoint; + VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in + << ", end_point:" << endpoint + << ", checkpoint_block_id: " << checkpoint_block_id; rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); @@ -252,6 +260,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode)); request_prefetch_handler_.reset( new distributed::RequestPrefetchHandler(sync_mode)); + request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler( + sync_mode, checkpoint_block_id)); rpc_service_->RegisterRPC(distributed::kRequestSend, request_send_handler_.get()); @@ -259,6 +269,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, request_get_handler_.get()); rpc_service_->RegisterRPC(distributed::kRequestPrefetch, request_prefetch_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, + request_checkpoint_handler_.get()); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -267,6 +279,13 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, auto *program = optimize_blocks[0]->Program(); framework::Executor executor(dev_place); + std::shared_ptr ckpt_pre_context = nullptr; + if (checkpoint_block_id != -1) { + auto ctx = executor.Prepare(*program, checkpoint_block_id); + // see: https://stackoverflow.com/a/14856553 + ckpt_pre_context = std::move(ctx); + } + // prepare for prefetch std::vector prefetch_block_id_list; std::unordered_map block_id_to_prefetch_var_name; @@ -297,13 +316,15 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; } - auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, - &dev_ctx, &executor, program, - &prefetch_var_name_to_prepared_ctx, rpc_service_.get()); + auto f = + std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, + &executor, program, &prefetch_var_name_to_prepared_ctx, + ckpt_pre_context, rpc_service_.get()); f(request_send_handler_.get()); f(request_get_handler_.get()); f(request_prefetch_handler_.get()); + f(request_checkpoint_handler_.get()); // start the server listening 
after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); @@ -317,9 +338,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // Write to a file of server selected port for python use. SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list); + RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, + checkpoint_block_id); } else { - RunAsyncLoop(&executor, program); + RunAsyncLoop(&executor, program, &recv_scope); } } @@ -349,6 +371,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault({}); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); + AddAttr(kCheckpointBlockId, + "BolckID to run save checkpoint on pserer.") + .SetDefault(-1); } }; diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 634c1b4f4b541be9f4950a9ef48f944863486705..978969cc515c7954b59f2bf7a4f2c0e1b13f9bc0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -32,6 +32,7 @@ namespace operators { constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; +constexpr char kCheckpointBlockId[] = "checkpint_block_id"; void RunServer(std::shared_ptr service); @@ -47,10 +48,12 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, - const std::vector& prefetch_block_id_list) const; + const std::vector& prefetch_block_id_list, + const int checkpoint_point_block_id) const; void RunAsyncLoop(framework::Executor* executor, - framework::ProgramDesc* program) const; + framework::ProgramDesc* program, + framework::Scope* recv_scope) const; void SavePort() const; @@ -67,6 +70,8 @@ class ListenAndServOp : public framework::OperatorBase { mutable std::shared_ptr request_get_handler_; mutable std::shared_ptr request_prefetch_handler_; + mutable std::shared_ptr + request_checkpoint_handler_; mutable std::shared_ptr server_thread_; }; diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 8f4b5049271c9592d2db268ea7ff2f5c8abc28b6..ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -34,6 +34,8 @@ class LoadOp : public framework::OperatorBase { auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); platform::RecordEvent record_event(Type(), dev_ctx); + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. 
auto filename = Attr("file_path"); std::ifstream fin(filename); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", @@ -44,9 +46,25 @@ class LoadOp : public framework::OperatorBase { PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", out_var_name); - auto *tensor = out_var->GetMutable(); + if (out_var->IsType()) { + LoadLodTensor(fin, place, out_var); + } else if (out_var->IsType()) { + LoadSelectedRows(fin, place, out_var); + } else { + PADDLE_ENFORCE( + false, + "Load only support LoDTensor and SelectedRows, %s has wrong type", + out_var_name); + } + } - DeserializeFromStream(fin, tensor, *dev_ctx); + void LoadLodTensor(std::istream &fin, const platform::Place &place, + framework::Variable *var) const { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + auto *tensor = var->GetMutable(); + DeserializeFromStream(fin, tensor, dev_ctx); auto load_as_fp16 = Attr("load_as_fp16"); auto in_dtype = framework::ToDataType(tensor->type()); @@ -63,18 +81,27 @@ class LoadOp : public framework::OperatorBase { &fp16_tensor); // reset output tensor - out_var->Clear(); - tensor = out_var->GetMutable(); + var->Clear(); + tensor = var->GetMutable(); tensor->set_lod(fp16_tensor.lod()); tensor->ShareDataWith(fp16_tensor); } } + + void LoadSelectedRows(std::istream &fin, const platform::Place &place, + framework::Variable *var) const { + auto *selectedRows = var->GetMutable(); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::DeserializeFromStream(fin, selectedRows, dev_ctx); + } }; class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "The tensor need to be loaded"); + AddOutput("Out", "The LoDTensor / SelectedRows need to be loaded"); AddAttr( "load_as_fp16", "If true, the tensor will be first loaded and then " @@ -85,7 +112,9 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { R"(Variable will be loaded from "file_path")") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); - AddComment("Load operator will load a tensor variable from disk file."); + AddComment( + "Load operator will load a LoDTensor / SelectedRows variable from disk " + "file."); } }; } // namespace operators diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 6207d14ecdc922cbca2d05d20e4b8a9da9b9d627..a907d6a71b7a16983e601073b039b48406853a0b 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -18,10 +18,7 @@ #include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_MKLML -#include -#include -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS @@ -55,7 +52,7 @@ static void SetNumThreads(int num_threads) { openblas_set_num_threads(real_num_threads); #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? 
num_threads : 1; - mkl_set_num_threads(real_num_threads); + platform::dynload::MKL_Set_Num_Threads(real_num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index ae20406bc21d5e08359be8295cd98495dda7813b..2ce94cfc93823aa891114ef8fd1e851727ebc623 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -22,61 +22,109 @@ namespace math { template struct CBlas; +#ifdef PADDLE_WITH_MKLML template <> struct CBlas { template static void GEMM(ARGS... args) { - cblas_sgemm(args...); + platform::dynload::cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_saxpy(args...); + platform::dynload::cblas_saxpy(args...); + } + + template + static void VCOPY(ARGS... args) { + platform::dynload::cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + platform::dynload::cblas_sgemv(args...); + } + + template + static void GEMM_BATCH(ARGS... args) { + platform::dynload::cblas_sgemm_batch(args...); } -#ifdef PADDLE_WITH_MKLML template static void VADD(ARGS... args) { - vsAdd(args...); + platform::dynload::vsAdd(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + platform::dynload::cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + platform::dynload::cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { - cblas_scopy(args...); + platform::dynload::cblas_dcopy(args...); } template static void GEMV(ARGS... args) { - cblas_sgemv(args...); + platform::dynload::cblas_dgemv(args...); } -#ifdef PADDLE_WITH_MKLML template static void GEMM_BATCH(ARGS... args) { - cblas_sgemm_batch(args...); + platform::dynload::cblas_dgemm_batch(args...); + } + + template + static void VADD(ARGS... args) { + platform::dynload::vdAdd(args...); } -#endif }; +#else + template <> -struct CBlas { +struct CBlas { template static void GEMM(ARGS... args) { - cblas_dgemm(args...); + cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_daxpy(args...); + cblas_saxpy(args...); } -#ifdef PADDLE_WITH_MKLML template - static void VADD(ARGS... args) { - vdAdd(args...); + static void VCOPY(ARGS... args) { + cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + cblas_sgemv(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { @@ -87,15 +135,8 @@ struct CBlas { static void GEMV(ARGS... args) { cblas_dgemv(args...); } - -#ifdef PADDLE_WITH_MKLML - template - static void GEMM_BATCH(ARGS... args) { - cblas_dgemm_batch(args...); - } -#endif }; - +#endif template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 8b296b6a07ca222ddc08fedfd2eed423b46dc5c3..56a039d3cec7375517573c9429801945bf99741e 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -14,9 +14,7 @@ limitations under the License. 
*/ #pragma once #ifdef PADDLE_WITH_MKLML -#include -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index 528a6e4a1b68fe611d104f21bafe970762611a03..123fa44fa3ddbc9343b9629be63fdefdf12b4646 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("SeedOut", "The random seed after random cropping.") .AsIntermediate(); AddAttr>("shape", "The shape of a cropped instance."); + AddAttr("startup_seed", + "If the input 'Seed' is not initialized, the 'startup_seed' " + "will be used to replace it. Even so, the seed after random " + "crop will also be outputed to the 'SeedOut'.") + .SetDefault(0); AddComment(R"DOC( This operator takes a batch of instance, and do random cropping on each instance. It means that cropping positions differs on each instance, which is determined @@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { class RandomCropOpInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext* ctx) const override { - auto seed_dim = ctx->GetInputDim("Seed"); - PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1); auto shape = ctx->Attrs().Get>("shape"); auto x_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_GT(x_dim.size(), static_cast(shape.size())); @@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase { out_dim[x_i] = shape[shape_i]; } ctx->SetOutputDim("Out", framework::make_ddim(out_dim)); - ctx->SetOutputDim("SeedOut", framework::make_ddim({1})); } }; diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index f3261cbdc986b0cc724315c1eb92b8b84e18c742..d68ba9d661698bb0d33b139f5748daec2ead6595 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -142,16 +142,22 @@ template class RandomCropKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& seed_tensor = detail::Ref(ctx.Input("Seed")); int64_t seed = 0; - if (platform::is_cpu_place(seed_tensor.place())) { - seed = *seed_tensor.data(); + auto& seed_tensor = detail::Ref(ctx.Input("Seed")); + if (seed_tensor.IsInitialized()) { + if (platform::is_cpu_place(seed_tensor.place())) { + seed = *seed_tensor.data(); + } else { + LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify " + "your program"; + framework::LoDTensor cpu_seed; + framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); + seed = *cpu_seed.data(); + } } else { - LOG(WARNING) << "It is slow to place seed in GPU memory. 
Please verify " - "your program"; - framework::LoDTensor cpu_seed; - framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); - seed = *cpu_seed.data(); + VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute " + "'startup_seed' instead."; + seed = ctx.Attr("startup_seed"); } auto shape = ctx.Attr>("shape"); auto& x = detail::Ref(ctx.Input("X")); @@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel { engine.discard(functor.prod_batchsize_dims_ * (functor.rank_ - functor.num_batchsize_dims_)); *ctx.Output("SeedOut")->mutable_data( - platform::CPUPlace()) = engine(); + framework::make_ddim({1}), platform::CPUPlace()) = engine(); } }; diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 62532036f86bfb82465ccd9e0ec526299489932a..a39c8a00538875e4e3284898230a6cb0693b7a12 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -24,6 +24,7 @@ reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_o reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) +reader_library(create_py_reader_op SRCS create_py_reader_op.cc) cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 71684b14176edc8f71efbefa9a7decffc8f3011e..db8cf3b605c9175eeda4548b1e7c8203f26c5d89 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -88,24 +88,29 @@ class BlockingQueue { receive_cv_.notify_all(); } - bool IsClosed() { + bool IsClosed() const { std::lock_guard lock(mutex_); return closed_; } - size_t Cap() { + size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; } + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + private: size_t capacity_; bool closed_; std::deque queue_; - std::mutex mutex_; - std::condition_variable receive_cv_; - std::condition_variable send_cv_; + mutable std::mutex mutex_; + mutable std::condition_variable receive_cv_; + mutable std::condition_variable send_cv_; }; } // namespace reader } // namespace operators diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 0a02fcdeaa5a6de97d59ddce4f58ad945aa2572a..a75c6d4c567ac93f37b38070421133af305f20a3 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -39,6 +39,7 @@ class CustomReader : public framework::DecoratedReader { const framework::ProgramDesc program_; int sub_block_id_; framework::Executor exe_; + framework::Scope scope_; std::vector source_var_names_; std::vector sink_var_names_; @@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector* out) { // The scope for CustomReader's sub-block should be independent and shouldn't // be any other computation scope's child. Otherwise, data preprocessing and // compution cannot be concurrent. - framework::Scope scope; + framework::Scope* exe_scope = &scope_.NewScope(); // 1. Copy LoDTensors from underlying reader's output to source variables. 
for (size_t i = 0; i < source_var_names_.size(); ++i) { - framework::Variable* var = scope.Var(source_var_names_[i]); + framework::Variable* var = exe_scope->Var(source_var_names_[i]); framework::LoDTensor* tensor = var->GetMutable(); tensor->ShareDataWith(underlying_outs[i]); tensor->set_lod(underlying_outs[i].lod()); } // 2. Run the sub-block. - exe_.Run(program_, &scope, sub_block_id_, false, true); + exe_.Run(program_, exe_scope, sub_block_id_, false, true); // 3. Copy LoDTensors from sink variables to out. out->resize(sink_var_names_.size()); for (size_t i = 0; i < sink_var_names_.size(); ++i) { - const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i])) + const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i])) .Get(); framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]); } + scope_.DeleteScope(exe_scope); } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 5f35b9b3eac1d9aab8662833c6e39d12f11a0087..5f734489a81764875988f440696682570ff4d1d7 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -23,13 +23,13 @@ namespace reader { // 'Double buffer' means we shall maintain two batches of input data at the same // time. So the kCacheSize shoul be at least 2. -static constexpr size_t kCacheSize = 3; +static constexpr size_t kCacheSize = 5; // There will be two bacthes out of the channel during training: // 1. the one waiting to be sent to the channel // 2. the one just be received from the channel, which is also being used by // subsequent operators. // So the channel size should be kChacheSize - 2 -static constexpr size_t kChannelSize = 1; // kCacheSize - 2 +static constexpr size_t kChannelSize = 3; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..36587360f7347a10e01d4e994482027d9a9bb5d0 --- /dev/null +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class PyReader : public framework::ReaderBase { + public: + explicit PyReader(const std::shared_ptr& queue) { + PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + queue_ = queue; + } + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + void ReInit() override {} + + private: + std::shared_ptr queue_; +}; + +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE( + queue_holder_var != nullptr, + "No LoDTensorBlockingQueueHolder variable with name %s found", + queue_name); + auto* queue_holder = + queue_holder_var->template GetMutable(); + + out->Reset(new PyReader(queue_holder->GetQueue())); + } +}; + +class CreatePyReaderOpMaker : public FileReaderMakerBase { + protected: + void Apply() override { + AddInput("blocking_queue", + "Name of the `LoDTensorBlockingQueueHolder` variable"); + + AddComment(R"DOC( + Create PyReader to support LoDTensor data feeding in Python side. + )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace reader = ::paddle::operators::reader; + +REGISTER_FILE_READER_OPERATOR(create_py_reader, reader::CreatePyReaderOp, + reader::CreatePyReaderOpMaker); diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..30d962ba10a954a837f9771d21cedf0feb643439 --- /dev/null +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace reader { + +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + + private: + LoDTensorBlockingQueue(size_t capacity, + const std::vector& dims) + : queue_(capacity), dims_(dims) {} + + public: + bool Push(const std::vector& lod_tensor_vec) { + CheckDims(lod_tensor_vec); + return queue_.Send(lod_tensor_vec); + } + + bool Push(std::vector&& lod_tensor_vec) { + CheckDims(lod_tensor_vec); + return queue_.Send(std::move(lod_tensor_vec)); + } + + std::vector Pop(bool* ok = nullptr) { + std::vector lod_tensor_vec; + bool success = queue_.Receive(&lod_tensor_vec); + if (ok != nullptr) *ok = success; + return lod_tensor_vec; + } + + inline size_t Cap() const { return queue_.Cap(); } + + inline size_t Size() const { return queue_.Size(); } + + inline void Close() { return queue_.Close(); } + + inline bool IsClosed() const { return queue_.IsClosed(); } + + private: + void CheckDims(const std::vector& lod_tensor_vec) { + PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(), + "Expect input size is %d but found %s", dims_.size(), + lod_tensor_vec.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + const auto& in_dims = framework::slice_ddim( + lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size()); + const auto& expect_dims = + framework::slice_ddim(dims_[i], 1, dims_[i].size()); + PADDLE_ENFORCE(in_dims == expect_dims, + "Dims of the %d-th input tensor do not match", i); + } + } + + BlockingQueue> queue_; + std::vector dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + void InitOnce(size_t capacity, const std::vector& dims) { + PADDLE_ENFORCE( + queue_ == nullptr, + "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + inline const std::shared_ptr& GetQueue() const { + return queue_; + } + + private: + std::shared_ptr queue_; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc index c4fcc61af4b75e6dc7d5c31e20c5fff358637af5..ccaea0eef2906953d922e097348b6c0a86dad6f1 100644 --- a/paddle/fluid/operators/save_load_op_test.cc +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -139,6 +139,7 @@ TEST(LoadFP16Op, CPU) { save_op->Run(scope, place); auto load_var = scope.Var("out_var"); + load_var->GetMutable(); auto load_op = paddle::framework::OpRegistry::CreateOp( "load", {}, {{"Out", {"out_var"}}}, attrs); load_op->Run(scope, place); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e6d27e2dedd7668b93bd8ddc330a897d1c6fa732..201a51130d6b6f94104e2dabf9e7facffa672ae0 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -22,11 +22,17 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { +// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables +// to directory specified. +constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; + // TODO(yuyang18): If the functions below are needed by other files, move them // to paddle::filesystem namespace. constexpr char kSEP = '/'; @@ -67,9 +73,27 @@ class SaveOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { + auto iname = Input("X"); + auto *var = scope.FindVar(iname); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", + iname); + + if (var->IsType()) { + SaveLodTensor(place, var); + } else if (var->IsType()) { + SaveSelectedRows(scope, place, var); + } else { + PADDLE_ENFORCE( + false, + "SaveOp only support LoDTensor and SelectedRows, %s has wrong type", + iname); + } + } + + void SaveLodTensor(const platform::Place &place, + framework::Variable *var) const { auto filename = Attr("file_path"); auto overwrite = Attr("overwrite"); - auto save_as_fp16 = Attr("save_as_fp16"); if (FileExists(filename) && !overwrite) { PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", @@ -78,26 +102,19 @@ class SaveOp : public framework::OperatorBase { MkDirRecursively(DirName(filename).c_str()); - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. - std::ofstream fout(filename); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - - auto iname = Input("X"); - auto *var = scope.FindVar(iname); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", - iname); - - PADDLE_ENFORCE(var->IsType(), - "SaveOp only support LoDTensor, %s has wrong type", iname); - auto &tensor = var->Get(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto save_as_fp16 = Attr("save_as_fp16"); auto in_dtype = framework::ToDataType(tensor.type()); auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; @@ -112,17 +129,43 @@ class SaveOp : public framework::OperatorBase { } else { framework::SerializeToStream(fout, tensor, dev_ctx); } + fout.close(); + } + + void SaveSelectedRows(const framework::Scope &scope, + const platform::Place &place, + framework::Variable *var) const { + auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + PADDLE_ENFORCE( + lt_var != nullptr, + "Can not find variable kLookupTablePath for SaveSelectedRows"); + std::string filename = lt_var->data(); + VLOG(4) << "SaveSelectedRows get File name: " << filename; + + auto &selectedRows = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. 
+ std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + framework::SerializeToStream(fout, selectedRows, dev_ctx); + fout.close(); } }; class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(Tensor ) Input tensor to be saved"); + AddInput("X", "(Tensor ) Input LoDTensor and SelectedRows to be saved"); AddComment(R"DOC( Save operator -This operator will serialize and write a tensor variable to file on disk. +This operator will serialize and write LoDTensor / SelectedRows variable to file on disk. )DOC"); AddAttr("overwrite", "(boolean, default true)" @@ -142,9 +185,26 @@ This operator will serialize and write a tensor variable to file on disk. } }; +class SaveOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front(); + auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class SaveOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker); +REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker, + ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference, + ops::SaveOpShapeInference); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index d62c387c3eebf9df0ab532f4e891da006f239468..39301e1ac0971dfe0ca7854257f10ddeb60f1000 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -151,9 +151,6 @@ struct SequenceExpandGradFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* dx) { - math::SetConstant set_zero; - set_zero(context, dx, static_cast(0)); - int dout_offset = 0; for (size_t i = 1; i < ref_lod.size(); ++i) { int repeat_num = ref_lod[i] - ref_lod[i - 1]; @@ -187,6 +184,10 @@ class SequenceExpandGradKernel : public framework::OpKernel { g_x->mutable_data(context.GetPlace()); g_x->set_lod(x->lod()); + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, g_x, static_cast(0)); + auto& y_lod = y->lod(); if (ref_level == -1) ref_level = y_lod.size() - 1; // just copy the gradient diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc index c703d11eeccf8418250f00c801f47418ee9c85ae..a2d44284e9de1ace42cabbce82e0b45929432d7b 100644 --- a/paddle/fluid/operators/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp { << " to " << offset + 1; out->resize(offset + 1); } + auto *out_tensor = &out->at(offset); + out_tensor->set_lod(x_tensor.lod()); if (x_tensor.memory_size() > 0) { - auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); TensorCopy(x_tensor, place, dev_ctx, out_tensor); - out_tensor->set_lod(x_tensor.lod()); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " 
"nothing has been written to output array[" diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 295d6ba0395b68cabab3bd4117cedd912df48f5d..1602a913aeebe43fabe2f9c9036edd18ac4c70fd 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -53,6 +53,7 @@ template class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + VLOG(4) << "TensorRTEngineKernel executing"; auto engine_name = context.Attr("engine_uniq_key"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 358e2d151bb8f990503ea8a51ba5f81e0a1dc816..82a16361e40513aeaf6f510e450f58989369fcdb 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" @@ -51,48 +52,10 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, *var = *desc.Proto(); } -template -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const T& data); - -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const std::string& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(data); -} -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const int& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::INT); - attr->set_i(data); -} -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const int64_t& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::LONG); - attr->set_l(data); -} -template <> -void SetAttr>(framework::proto::OpDesc* op, - const std::string& name, - const std::vector& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::STRINGS); - for (const auto& s : data) { - attr->add_strings(s.c_str()); - } -} - } // namespace +using inference::analysis::SetAttr; + TEST(TensorRTEngineOp, manual) { framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 6dd19aaeffef8aa8a7d1997915908af04273d50c..9da787a4073fa002f75154f7c4fba54e9ed8efa6 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -17,3 +17,7 @@ if (CUPTI_FOUND) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +if (WITH_MKLML) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) +endif() +# TODO(TJ): add iomp, mkldnn? 
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 19c01dc5a968c7e1d2b0f15cf9a0e8427004e58b..198d8566b1bd726c5b33d8af22a19cb30a280fa2 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -49,6 +49,8 @@ DEFINE_string( tensorrt_dir, "", "Specify path for loading tensorrt library, such as libnvinfer.so."); +DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); + namespace paddle { namespace platform { namespace dynload { @@ -76,6 +78,7 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, VLOG(3) << "Try to find library: " << dso_path << " from default system path."; // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + // and /usr/local/lib path void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to @@ -97,6 +100,10 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, } #endif + if (nullptr == dso_handle) { + LOG(WARNING) << "Can not find library: " << dso_path + << ". Please try to add the lib path to LD_LIBRARY_PATH."; + } return dso_handle; } @@ -206,6 +213,14 @@ void* GetTensorRtDsoHandle() { #endif } +void* GetMKLMLDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 0de3559b6088086cb52c254535b6ec42da7dd724..ca87dc47f355a8a4fc840262044413414edf00a0 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -26,6 +26,7 @@ void* GetWarpCTCDsoHandle(); void* GetLapackDsoHandle(); void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); +void* GetMKLMLDsoHandle(); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f61a5e09b3243cbdf570ba7c28a260f181d8848 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag mklml_dso_flag; +void* mklml_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MKLML_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h new file mode 100644 index 0000000000000000000000000000000000000000..17acefe8cde01809572e4c86cbdccfed9a477a51 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag mklml_dso_flag; +extern void* mklml_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mklml routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using mklmlFunc = decltype(&::__name); \ + std::call_once(mklml_dso_flag, []() { \ + mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ + }); \ + static void* p_##_name = dlsym(mklml_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name) + +#define MKLML_ROUTINE_EACH(__macro) \ + __macro(cblas_sgemm); \ + __macro(cblas_saxpy); \ + __macro(cblas_scopy); \ + __macro(cblas_sgemv); \ + __macro(cblas_sgemm_batch); \ + __macro(cblas_dgemm); \ + __macro(cblas_daxpy); \ + __macro(cblas_dcopy); \ + __macro(cblas_dgemv); \ + __macro(cblas_dgemm_batch); \ + __macro(vsAdd); \ + __macro(vdAdd); \ + __macro(MKL_Set_Num_Threads) + +MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); + +#undef DYNAMIC_LOAD_MKLML_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index ed99932546446eb877c9701de15e2d37d29b5f88..a6cccc31219104767ac38bdebeb1d4c0e8c2ac01 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -228,7 +228,7 @@ class MKLDNNHandler { return dstr; }; return dims2str(operand_dims) + suffix; - }; + } protected: const MKLDNNDeviceContext& dev_ctx_; @@ -237,5 +237,15 @@ class MKLDNNHandler { bool is_reusing_; }; +inline mkldnn::memory::format MKLDNNFormatForSize( + size_t dims_size, mkldnn::memory::format data_format) { + if (dims_size == 1) { + return mkldnn::memory::format::x; + } else if (dims_size == 2) { + return mkldnn::memory::format::nc; + } + return data_format; +} + } // namespace platform } // namespace 
paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5a45e431df993febab676f22da7116d84e441548..36d080996831d4ad90d92baeafbe964693e2332a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -297,6 +298,37 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ReInit); + using LoDTensorBlockingQueue = + ::paddle::operators::reader::LoDTensorBlockingQueue; + using LoDTensorBlockingQueueHolder = + ::paddle::operators::reader::LoDTensorBlockingQueueHolder; + py::class_(m, "LoDTensorBlockingQueue", "") + .def("push", + [](LoDTensorBlockingQueue &self, + const std::vector &lod_tensor_vec) { + pybind11::gil_scoped_release release; + return self.Push(lod_tensor_vec); + }) + .def("size", &LoDTensorBlockingQueue::Size) + .def("capacity", &LoDTensorBlockingQueue::Cap) + .def("close", &LoDTensorBlockingQueue::Close) + .def("is_closed", &LoDTensorBlockingQueue::IsClosed); + + m.def("init_lod_tensor_blocking_queue", + [](Variable &var, size_t capacity, + const std::vector> &shapes) + -> LoDTensorBlockingQueue * { + std::vector dims(shapes.size()); + std::transform(shapes.begin(), shapes.end(), dims.begin(), + [](const std::vector &shape) { + return make_ddim(shape); + }); + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, dims); + return holder->GetQueue().get(); + }, + py::return_value_policy::reference); + py::class_(m, "Scope", "") .def("var", [](Scope &self, const std::string &name) -> Variable * { @@ -463,9 +495,11 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_DISTRIBUTE .def("complete", &Executor::Complete) #endif - .def("run", - (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & - Executor::Run); + .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, + int block_id, bool create_local_scope, bool create_vars) { + pybind11::gil_scoped_release release; + self.Run(prog, scope, block_id, create_local_scope, create_vars); + }); m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); @@ -631,7 +665,12 @@ All parameter, weight, gradient are variables in Paddle. &ParallelExecutor::FeedTensorsIntoLocalScopes) .def("feed_and_split_tensor_into_local_scopes", &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", &ParallelExecutor::Run); + .def("run", [](ParallelExecutor &self, + const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + pybind11::gil_scoped_release release; + self.Run(fetch_tensors, fetched_var_name); + }); BindRecordIOWriter(&m); return m.ptr(); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 6da3846ac69980daac4f0fb7401b2573c21c89bf..3e2ea1ef88b03f5b2576c1cee2b5d26a439943da 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -146,7 +146,7 @@ void PyCPUTensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. 
-void PyCPUTensorSetFromArray( +inline void PyCPUTensorSetFromArray( framework::Tensor *self, pybind11::array_t @@ -185,7 +185,7 @@ void PyCUDATensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCUDATensorSetFromArray( +inline void PyCUDATensorSetFromArray( framework::Tensor *self, pybind11::array_t @@ -224,7 +224,7 @@ void PyCUDAPinnedTensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCUDAPinnedTensorSetFromArray( +inline void PyCUDAPinnedTensorSetFromArray( framework::Tensor *self, pybind11::array_t diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 037688bde9122c1d999e90f2438977b46c1eb531..b66a05aaebda645196721fd6ed840e5584813348 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -106,6 +106,8 @@ function cmake_gen() { -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} + -DWITH_ANAKIN=${WITH_ANAKIN:-ON} + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -133,7 +135,8 @@ EOF -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ - -DWITH_ANAKIN=${WITH_ANAKIN:-ON} + -DWITH_ANAKIN=${WITH_ANAKIN:-ON} \ + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} } function abort(){ diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 6a1b8b5fac223c0d134cae69a61a0c2c00bc1feb..9d05aeeb95c4f936cb773ece20407ecb32cbbf21 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -111,7 +111,7 @@ def fetch(): paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) - paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5) def convert(path): diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 45af83708ea63fc1b6aa86f1e8423bb44b7388a6..3034c1a0875a71421bcba172c16ee32d809df152 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -118,7 +118,8 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb' + 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', + 'init_allocated_mem' ] if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index dc275674618ee147dad2e32c7db29132ab55eb29..145f1423e4b4a2ce35ba8ac3cca37935df90727e 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -78,6 +78,8 @@ def as_numpy(tensor): Returns: numpy.ndarray """ + if isinstance(tensor, core.LoDTensorArray): + return [as_numpy(t) for t in tensor] if isinstance(tensor, list): return [as_numpy(t) for t in tensor] assert isinstance(tensor, core.LoDTensor) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4c1c8443a641cde40c392f1c647bc78d6cd3c13c..9dcd907451dacaf95a7fe0d3a510241bc3da7f95 100644 --- a/python/paddle/fluid/framework.py +++ 
b/python/paddle/fluid/framework.py @@ -27,6 +27,7 @@ __all__ = [ 'Variable', 'Program', 'Operator', + 'Parameter', 'default_startup_program', 'default_main_program', 'program_guard', @@ -454,7 +455,7 @@ class Operator(object): 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', 'channel_send', - 'channel_recv', 'select', 'gen_nccl_id' + 'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id' } def __init__(self, @@ -559,19 +560,8 @@ class Operator(object): self.attrs[attr_name] is None): continue attr_val = self.attrs[attr_name] - if isinstance(attr_val, Block): - self.desc.set_block_attr(attr_name, - self.attrs[attr_name].desc) - elif isinstance(attr_val, list) and attr_val and \ - all(isinstance(v, Block) for v in attr_val): - self.desc.set_blocks_attr(attr_name, - [v.desc for v in attr_val]) - elif isinstance(attr_val, core.BlockDesc) or \ - isinstance(attr_val, core.ProgramDesc): - self.desc.set_serialized_attr( - attr_name, attr_val.serialize_to_string()) - else: - self.desc.set_attr(attr_name, attr_val) + self._update_desc_attr(attr_name, attr_val) + self.desc.check_attrs() if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) @@ -718,6 +708,19 @@ class Operator(object): ValueError: If the type of value doesn't match with desc.attr_type(name). """ self.attrs[name] = val + self._update_desc_attr(name, val) + + def _update_desc_attr(self, name, val): + """ + Update the value of desc's attribute by attribute's name. + + Args: + name(str): the attribute name. + val(bool|int|str|float|list): the value of the attribute. + + Raises: + ValueError: If the type of value doesn't match with desc.attr_type(name). + """ if isinstance(val, Block): self.desc.set_block_attr(name, val.desc) elif isinstance(val, list) and val and all( @@ -1212,6 +1215,9 @@ class Block(object): if var.type == core.VarDesc.VarType.STEP_SCOPES: ret_var = self.create_var( name=var.name, persistable=var.persistable, type=var.type) + elif var.type == core.VarDesc.VarType.RAW: + ret_var = self.create_var( + name=var.name, persistable=var.persistable, type=var.type) elif var.type == core.VarDesc.VarType.SELECTED_ROWS: ret_var = self.create_var( name=var.name, @@ -1917,11 +1923,11 @@ def program_guard(main_program, startup_program=None): def get_var(name, program=None): """ Get a variable by name from the global block of a program. - + Args: name(str): name of the variable program(Program|None): program object. - If None, default_global_program() will be used. + If None, default_global_program() will be used. Returns: Variable diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 6e527572f1ca77be9fe069654db00d16ad5c21ef..d94564e11f982575dd9c065deb20d29396203227 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import errno import time import shutil @@ -25,7 +26,8 @@ __all__ = [ 'load_persistables', 'save_inference_model', 'load_inference_model', 'get_inference_program', 'save_checkpoint', 'load_checkpoint', 'clean_checkpoint', 'load_persist_vars_without_grad', - 'save_persist_vars_without_grad', 'get_latest_checkpoint_serial' + 'load_lookup_table_vars', 'save_persist_vars_without_grad', + 'get_latest_checkpoint_serial' ] @@ -795,6 +797,7 @@ def get_parameter_value_by_name(name, executor, program=None): SUCCESS_MARK_FILENAME = "_SUCCESS" CHECKPOINT_PREFIX = "checkpoint" MODEL_DIR = "__model__" +LOOKUP_TABLE_DIR = "__lookup_table__" TRAINER_PREFIX = "trainer" CHECKPOINT_SEPARATOR = "_" @@ -804,7 +807,9 @@ def save_checkpoint(executor, trainer_id, trainer_args=None, main_program=None, - max_num_checkpoints=3): + max_num_checkpoints=3, + lookup_table=None, + ps_endpoint_list=None): """ This function filters out all checkpoint variables from the give main_program and then saves these variables to the `checkpoint_dir` @@ -836,6 +841,12 @@ def save_checkpoint(executor, max_num_checkpoints(int): The max number of total number of existing checkpoints. Default: 3 + lookup_table(string|None): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list|None): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. Returns: None @@ -852,30 +863,40 @@ def save_checkpoint(executor, prog = fluid.default_main_program() trainer_args = {"epoch_id": 200, "step_id": 20} # just an example + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + fluid.io.save_checkpoint(executor=exe, checkpoint_dir=path, trainer_id=0, trainer_args=trainer_args, main_program=prog, - max_num_checkpoints=3) + max_num_checkpoints=3, + lookup_table=table_name, + ps_endpoint_list = ps_endpoints) """ if checkpoint_dir is None: raise ValueError("'checkpoint_dir' should not be None") + assert checkpoint_dir if trainer_args: assert isinstance(trainer_args, dict) - if not os.path.isdir(checkpoint_dir): - os.makedirs(checkpoint_dir) + is_chief = trainer_id == 0 + _make_chekcpoint_dirs(checkpoint_dir) serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) save_trainer_args(cur_dir, trainer_id, trainer_args) - if trainer_id == 0: + if is_chief: save_persist_vars_without_grad(executor, cur_dir, main_program) + if is_chief and lookup_table and ps_endpoint_list: + save_pserver_vars_by_notify(executor, cur_dir, lookup_table, + ps_endpoint_list) + _scroll_delete(checkpoint_dir, max_num_checkpoints) @@ -942,8 +963,9 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): def clean_checkpoint(checkpoint_dir, delete_dir=False): """ - clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. - delete_dir only works when the directory is empty, otherwise, OSError is raised. + clean the checkpoint dir, when the train exits normally, + the trainer will call clean_checkpoint to delete checkpoint directory saved before. + delete_dir only works when the directory is empty, otherwise, OSError is raised. 
: param checkpoint_dir : param delete_dir @@ -1009,6 +1031,56 @@ def load_persist_vars_without_grad(executor, filename=None) +def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): + """ + The parameter server will load lookup table's local file in + selectedrows variable. + + Args: + executor(Executor): The executor to run for loading persistable variables + dirname(str): The directory path + main_program(Program): Find the variable named table_name in main_program + pserver_id(int): the serial number in pserver_endpoints list + table_name(str): lookup table name + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + dirname = "./checkpoints/checkpoint_9/__model__" + prog = fluid.default_main_program() + pserver_id = 1 + table_name = "share_w" + fluid.io.load_lookup_table_vars(executor=exe, + dirname=dirname, program=prog, pserver_id=pserver_id, + table_name=table_name) + """ + + for var in program.list_vars(): + if var.name == table_name: + lookup_table_var = var + break + + assert lookup_table_var is not None + + lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) + + load_prog = Program() + load_block = load_prog.global_block() + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [lookup_table_var]}, + attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) + + executor.run(load_prog) + + def save_persist_vars_without_grad(executor, dirname, program): """ This function filters out all checkpoint variables from the give @@ -1055,6 +1127,54 @@ def save_persist_vars_without_grad(executor, dirname, program): _write_success(cur_dir) +def save_pserver_vars_by_notify(executor, dirname, lookup_table, + ps_endpoint_list): + """ + This function will send checkpoint notify message from Trainer 0 + to all the pservers. + The checkpoint notify message contains lookup table name, + the absolute path on pserver to save lookup_table. + + Args: + executor(Executor): The executor to run for send checkpoint notify. + dirname(str): The folder where to save checkpoints. + lookup_table(string): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. + Return: + None + + Examples: + .. 
code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + fluid.io.save_pserver_vars_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + ps_endpoint_list=ps_endpoints) + """ + cur_dir = _get_lookuptable_dir(dirname) + + checkpoint_notify_program = Program() + checkpoint_notify_block = checkpoint_notify_program.global_block() + + attrs = {} + attrs['epmap'] = ps_endpoint_list + attrs['dir'] = cur_dir + attrs['lookup_table'] = lookup_table + + checkpoint_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(checkpoint_notify_program) + + def save_trainer_args(dirname, trainer_id, trainer_args): assert isinstance(trainer_args, dict) @@ -1068,6 +1188,29 @@ def save_trainer_args(dirname, trainer_id, trainer_args): def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + """ + trainer will load some args from it's independent directory, + such as epoch_id and step_id. + + Args: + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + trainer_id(int): current trainer id. + trainer_args(list): list about load trainer args + Return: + None + + Examples: + .. code-block:: python + + param_path = "./checkpoint/" + serial = 7 + trainer_id = 2 + trainer_args = ["epoch_id", "step_id"] + + fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial, + trainer_id=trainer_id, trainer_args=trainer_args) + """ assert isinstance(trainer_args, list) cur_dir = _get_serial_dir(checkpoint_dir, serial) @@ -1088,7 +1231,7 @@ def _is_checkpoint_var(var): the checkpoint will not save or load all the variables. var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - : param var + : param var(Variable) """ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ @@ -1108,6 +1251,23 @@ def _is_checkpoint_var(var): return var.persistable +def _make_chekcpoint_dirs(dirs): + """ + _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. 
+ """ + assert dirs is not None + + if os.path.isfile(dirs): + raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + + if not os.path.isdir(dirs): + try: + os.makedirs(dirs) + except OSError as err: + if err.errno != errno.EEXIST: + raise err + + def _get_dir_serial(dirname): _, serial = dirname.split(CHECKPOINT_SEPARATOR) @@ -1121,29 +1281,27 @@ def _get_dir_serial(dirname): def _get_serial_dir(dirname, serial): serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) serial_dir = os.path.join(dirname, serial_folder) - - if not os.path.isdir(serial_dir): - os.makedirs(serial_dir) + _make_chekcpoint_dirs(serial_dir) return serial_dir def _get_model_dir(dirname): model_dir = os.path.join(dirname, MODEL_DIR) + _make_chekcpoint_dirs(model_dir) + return model_dir - if not os.path.isdir(model_dir): - os.makedirs(model_dir) - return model_dir +def _get_lookuptable_dir(dirname): + lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + _make_chekcpoint_dirs(lookuptable_dir) + return lookuptable_dir def _get_trainer_dir(dirname, trainer_id): trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) trainer_dir = os.path.join(dirname, trainer_folder) - - if not os.path.isdir(trainer_dir): - os.makedirs(trainer_dir) - + _make_chekcpoint_dirs(trainer_dir) return trainer_dir @@ -1162,7 +1320,11 @@ def _scroll_delete(dirname, max_num_checkpoints=3): serials = serials[max_num_checkpoints:] for serial in serials: cur_dir = _get_serial_dir(dirname, serial) - shutil.rmtree(cur_dir) + try: + shutil.rmtree(cur_dir) + except OSError as err: + if err.errno != errno.ENOENT: + raise err def _write_success(dirname): diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index f3ab47c96b1caa2facfd6d191af014b4c7380cbc..f33ae76aea95ceeca73c5bae6e4e490cdff29bf3 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -110,7 +110,7 @@ class BlockGuardServ(BlockGuard): class ListenAndServ(object): """ **ListenAndServ Layer** - + ListenAndServ is used to create a rpc server bind and listen on specific TCP port, this server will run the sub-block when received variables from clients. @@ -212,7 +212,7 @@ def Send(endpoints, send_vars, sync=True): of send_vars to send send_vars (list): variables to send to server sync (bool): whether to wait the request finish - + """ assert (type(send_vars) == list) @@ -469,10 +469,13 @@ def open_files(filenames, lod_levels(list): List of ints which declaring data lod_level. dtypes(list): List of strs which declaring data type. thread_num(int): The maximal concurrent prefetch thread number. - buffer_size(int): The size of prefetch buffer. + buffer_size(int|None): The size of prefetch buffer. If it is setted None, + buffer size will be thread_num * 3. + Default: None pass_num(int): Number of passes to run. for_parallel(Bool): Set it as True if you are going to run subsequent operators in parallel. + Default: True Returns: Variable: A Reader Variable via which we can get file data. 
@@ -492,7 +495,7 @@ def open_files(filenames, image, label = fluid.layers.io.read_file(reader) """ if buffer_size is None: - buffer_size = thread_num + buffer_size = thread_num * 3 if isinstance(filenames, basestring): filenames = [filenames] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index be22bde4608807aff12ae8fa4b4c723211ffecce..bcf520d5a4e3bbe1d949d08f42199dd8c5cdc947 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,6 +23,7 @@ from layer_function_generator import autodoc, templatedoc from tensor import concat import utils import random +from .. import unique_name __all__ = [ 'fc', @@ -1992,7 +1993,8 @@ def batch_norm(input, name=None, moving_mean_name=None, moving_variance_name=None, - do_model_average_for_mean_and_var=False): + do_model_average_for_mean_and_var=False, + fuse_with_relu=False): """ **Batch Normalization Layer** @@ -2035,6 +2037,7 @@ def batch_norm(input, moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. + fuse_with_relu (bool): if True, this OP performs relu after batch norm. Returns: Variable: A tensor variable which is the result after applying batch normalization on the input. @@ -2120,7 +2123,8 @@ def batch_norm(input, "momentum": momentum, "epsilon": epsilon, "is_test": is_test, - "use_mkldnn": use_mkldnn + "use_mkldnn": use_mkldnn, + "fuse_with_relu": fuse_with_relu }) return helper.append_activation(batch_norm_out) @@ -2223,56 +2227,6 @@ def layer_norm(input, return helper.append_activation(layer_norm_out) -def beam_search_decode(ids, scores, name=None): - """ - Beam Search Decode - - This layers is to pack the output of beam search layer into sentences and - associated scores. It is usually called after the beam search layer. - Typically, the output of beam search layer is a tensor of selected ids, with - a tensor of the score of each id. Beam search layer's output ids, however, - are generated directly during the tree search, and they are stacked by each - level of the search tree. Thus we need to reorganize them into sentences, - based on the score of each id. This layer takes the output of beam search - layer as input and repack them into sentences. - - Args: - ids (Variable): The selected ids, output of beam search layer. - scores (Variable): The associated scores of the ids, out put of beam - search layer. - name (str): The name of this layer. It is optional. - - Returns: - tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores. - sentence_ids is a tensor with shape [size, length], where size is the - beam size of beam search, and length is the length of each sentence. - Note that the length of sentences may vary. - sentence_scores is a tensor with the same shape as sentence_ids. - - Examples: - .. 
code-block:: python - - ids, scores = fluid.layers.beam_search( - pre_ids, ids, scores, beam_size, end_id) - sentence_ids, sentence_scores = fluid.layers.beam_search_decode( - ids, scores) - """ - helper = LayerHelper('beam_search_decode', **locals()) - sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) - sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) - - helper.append_op( - type="beam_search_decode", - inputs={"Ids": ids, - "Scores": scores}, - outputs={ - "SentenceIds": sentence_ids, - "SentenceScores": sentence_scores - }) - - return sentence_ids, sentence_scores - - def conv2d_transpose(input, num_filters, output_size=None, @@ -2383,10 +2337,17 @@ def conv2d_transpose(input, data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) """ - helper = LayerHelper("conv2d_transpose", **locals()) + + input_channel = input.shape[1] + + op_type = 'conv2d_transpose' + if (input_channel == groups and num_filters == input_channel and + not use_cudnn): + op_type = 'depthwise_conv2d_transpose' + + helper = LayerHelper(op_type, **locals()) if not isinstance(input, Variable): raise TypeError("Input of conv2d_transpose must be Variable") - input_channel = input.shape[1] padding = utils.convert_to_list(padding, 2, 'padding') stride = utils.convert_to_list(stride, 2, 'stride') @@ -2420,7 +2381,7 @@ def conv2d_transpose(input, pre_bias = helper.create_tmp_variable(dtype=input.dtype) helper.append_op( - type='conv2d_transpose', + type=op_type, inputs={'Input': [input], 'Filter': [img_filter]}, outputs={'Output': pre_bias}, @@ -2676,38 +2637,89 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp -def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): - ''' - **beam search** - - This function implements the beam search algorithm. - - Beam search is a classical algorithm for selecting candidate words - in a machine translation task. +def beam_search(pre_ids, + pre_scores, + ids, + scores, + beam_size, + end_id, + level=0, + name=None): + """ + Beam search is a classical algorithm for selecting candidate words in a + machine translation task. Refer to `Beam search `_ for more details. + + This layer does the search in beams for one time step. Specifically, it + selects the top-K candidate word ids of current step from :attr:`ids` + according to their :attr:`scores` for all source sentences, where K is + :attr:`beam_size` and :attr:`ids, scores` are predicted results from the + computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are + the output of beam_search at previous step, they are needed for special use + to handle ended candidate translations. + + Note that the :attr:`scores` passed in should be accumulated scores, and + length penalty should be done with extra operators before calculating the + accumulated scores if needed, also suggest finding top-K before it and + using the top-K candidates following. + + Please see the following demo for a fully beam search usage example: + + fluid/tests/book/test_machine_translation.py Args: - pre_ids (Variable): ids in previous step. 
- ids (Variable): a LoDTensor of shape of [None,k] - scores (Variable): a LoDTensor that has the same shape and LoD with `ids` - beam_size (int): beam size for beam search - end_id (int): the token id which indicates the end of a sequence - level (int): the level of LoDTensor + pre_ids(Variable): The LodTensor variable which is the output of + beam_search at previous step. It should be a LodTensor with shape + :math:`(batch_size, 1)` and lod + :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the + first step. + pre_scores(Variable): The LodTensor variable which is the output of + beam_search at previous step. + ids(Variable): The LodTensor variable containing the candidates ids. + Its shape should be :math:`(batch_size \\times beam_size, K)`, + where :math:`K` supposed to be :attr:`beam_size`. + scores(Variable): The LodTensor variable containing the accumulated + scores corresponding to :attr:`ids` and its shape is the same as + the shape of :attr:`ids`. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + level(int, default 0): It can be ignored and mustn't change currently. + It means the source level of lod, which is explained as following. + The lod level of :attr:`ids` should be 2. The first level is source + level which describes how many prefixes (branchs) for each source + sentece (beam), and the second level is sentence level which + describes how these candidates belong to the prefix. The paths + linking prefixes and selected candidates are organized and reserved + in lod. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: - tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores` + Variable: The LodTensor pair containing the selected ids and the \ + corresponding scores. Examples: .. code-block:: python - # current_score is a Tensor of shape (num_batch_size, embed_size), which - # consists score of each candidate word. - topk_scores, topk_indices = pd.topk(current_score, k=50) - selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) - ''' + # Suppose `probs` contains predicted results from the computation + # cell and `pre_ids` and `pre_scores` is the output of beam_search + # at previous step. + topk_scores, topk_indices = layers.topk(probs, k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(x=topk_scores)), + y=layers.reshape( + pre_scores, shape=[-1]), + axis=0) + selected_ids, selected_scores = layers.beam_search( + pre_ids=pre_ids, + pre_scores=pre_scores, + ids=topk_indices, + scores=accu_scores, + beam_size=beam_size, + end_id=end_id) + """ helper = LayerHelper('beam_search', **locals()) score_type = scores.dtype id_type = ids.dtype @@ -2719,6 +2731,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): type='beam_search', inputs={ 'pre_ids': pre_ids, + 'pre_scores': pre_scores, 'ids': ids, 'scores': scores, }, @@ -2736,6 +2749,56 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): return selected_ids, selected_scores +def beam_search_decode(ids, scores, beam_size, end_id, name=None): + """ + Beam Search Decode Layer. This layer constructs the full hypotheses for + each source sentence by walking back along the LoDTensorArray :attr:`ids` + whose lods can be used to restore the path in the beam search tree. 
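For context, here is a condensed, hedged sketch of how the reworked interface is exercised in the machine-translation tests updated later in this change; `pd` is the `paddle.fluid.layers` alias used there, and `probs`, `pre_ids`, `pre_score`, `counter`, `array_len`, `cond`, `ids_array` and `scores_array` come from the surrounding while-loop:

.. code-block:: python

    # One decode step: find top-K candidates, accumulate scores, then search.
    topk_scores, topk_indices = pd.topk(probs, k=beam_size)
    accu_scores = pd.elementwise_add(
        x=pd.log(topk_scores),
        y=pd.reshape(pre_score, shape=[-1]),
        axis=0)
    selected_ids, selected_scores = pd.beam_search(
        pre_ids, pre_score, topk_indices, accu_scores, beam_size,
        end_id=10, level=0)

    # Stop when the max length is reached or every candidate has ended.
    length_cond = pd.less_than(x=counter, y=array_len)
    finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
    pd.logical_and(x=length_cond, y=finish_cond, out=cond)

    # After the loop, walk back through the LoDTensorArrays to build hypotheses.
    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)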
+ Please see the following demo for a fully beam search usage example: + fluid/tests/book/test_machine_translation.py + + Args: + ids(Variable): The LodTensorArray variable containing the selected ids + of all steps. + scores(Variable): The LodTensorArray variable containing the selected + scores of all steps. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The LodTensor pair containing the generated id sequences \ + and the corresponding scores. The shapes and lods of the two \ + LodTensor are same. The lod level is 2 and the two levels \ + separately indicate how many hypotheses each source sentence has \ + and how many ids each hypothesis has. + + Examples: + .. code-block:: python + # Suppose `ids` and `scores` are LodTensorArray variables reserving + # the selected ids and scores of all steps + finished_ids, finished_scores = layers.beam_search_decode( + ids, scores, beam_size=5, end_id=0) + """ + helper = LayerHelper('beam_search_decode', **locals()) + sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) + sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) + + helper.append_op( + type="beam_search_decode", + inputs={"Ids": ids, + "Scores": scores}, + outputs={ + "SentenceIds": sentence_ids, + "SentenceScores": sentence_scores + }, + attrs={"beam_size": beam_size, + "end_id": end_id}) + + return sentence_ids, sentence_scores + + def lstm_unit(x_t, hidden_t_prev, cell_t_prev, @@ -4266,14 +4329,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): say :attr:`actual_shape` has a higher priority than :attr:`shape`. act (str): The non-linear activation to be applied to output variable. - inplace(bool): If this flag is set true, a new output tensor is created - whose data is copied from input x, otherwise the output - shares data with input without copying. + inplace(bool): If this flag is set true, the output + shares data with input without copying, otherwise + a new output tensor is created + whose data is copied from input x. name (str): The name of this layer. It is optional. Returns: Variable: The output tensor. + Raises: + TypeError: if actual_shape is neither Variable nor None. + Examples: .. 
code-block:: python @@ -4285,6 +4352,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): if not (isinstance(shape, list) or isinstance(shape, tuple)): raise ValueError("Input shape must be a python lsit or tuple.") + inputs = {"X": x} + if isinstance(actual_shape, Variable): + inputs["Shape"] = actual_shape + elif actual_shape is not None: + raise TypeError("actual_shape should either be Variable or None") # Validate the shape unk_dim_idx = -1 @@ -4305,9 +4377,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): reshaped = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type="reshape", - inputs={"X": x, - "Shape": actual_shape} - if isinstance(actual_shape, Variable) else {"X": x}, + inputs=inputs, attrs={"shape": shape, "inplace": inplace}, outputs={"Out": reshaped}) @@ -4889,47 +4959,39 @@ def random_crop(x, shape, seed=None): >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) """ helper = LayerHelper("random_crop", **locals()) - dtype = helper.input_dtype() + dtype = x.dtype out = helper.create_tmp_variable(dtype) if seed is None: seed = random.randint(-65536, 65535) - + op_attrs = {"shape": shape} if isinstance(seed, int): - seed_value = seed - seed = helper.create_tmp_variable(dtype="int64") - helper.append_op( - type="fill_constant", - inputs={}, - outputs={"Out": seed}, - attrs={ - "dtype": seed.dtype, - "shape": [1], - "value": float(seed_value), - "force_cpu": True - }) + op_attrs["startup_seed"] = seed + seed = helper.create_variable( + name=unique_name.generate("random_crop_seed"), + dtype="int64", + persistable=True) elif not isinstance(seed, Variable): raise ValueError("'seed' must be a Variable or an int.") - seed_out = helper.create_tmp_variable(dtype="int64") helper.append_op( type="random_crop", inputs={"X": x, "Seed": seed}, outputs={"Out": out, - "SeedOut": seed_out}, - attrs={"shape": shape}) + "SeedOut": seed}, + attrs=op_attrs) return out -def log(input): +def log(x): """ Calculates the natural log of the given input tensor, element-wise. .. math:: - Out = \\ln(input) + Out = \\ln(x) Args: - input (Variable): Input tensor. + x (Variable): Input tensor. Returns: Variable: The natural log of the input tensor computed element-wise. @@ -4938,7 +5000,7 @@ def log(input): .. code-block:: python - output = fluid.layers.log(input) + output = fluid.layers.log(x) """ helper = LayerHelper('log', **locals()) dtype = helper.input_dtype(input_param_name='x') @@ -4947,18 +5009,18 @@ def log(input): return out -def relu(input): +def relu(x): """ Relu takes one input data (Tensor) and produces one output data (Tensor) - where the rectified linear function, y = max(0, input), is applied to + where the rectified linear function, y = max(0, x), is applied to the tensor elementwise. .. math:: - Out = \\max(0, input) + Out = \\max(0, x) Args: - input (Variable): The input tensor. + x (Variable): The input tensor. Returns: Variable: The output tensor with the same shape as input. @@ -4967,7 +5029,7 @@ def relu(input): .. 
code-block:: python - output = fluid.layers.relu(input) + output = fluid.layers.relu(x) """ helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') @@ -5015,12 +5077,12 @@ def mean_iou(input, label, num_classes): out_correct = helper.create_tmp_variable(dtype='int32') helper.append_op( type="mean_iou", - inputs={"predictions": input, - "labels": label}, + inputs={"Predictions": input, + "Labels": label}, outputs={ - "out_mean_iou": out_mean_iou, - "out_wrong": out_wrong, - "out_correct": out_correct + "OutMeanIou": out_mean_iou, + "OutWrong": out_wrong, + "OutCorrect": out_correct }, attrs={"num_classes": num_classes}) return out_mean_iou, out_wrong, out_correct diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 109a2694ad0d23ee35fb48810aba94842718fd6b..b6614ecf3bc16e73683f4991779769049c6800ed 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -156,7 +156,7 @@ def cast(x, dtype): Examples: .. code-block:: python - + data = fluid.layers.data(name='x', shape=[13], dtype='float32') result = fluid.layers.cast(x=data, dtype='float64') """ @@ -189,7 +189,7 @@ def concat(input, axis=0, name=None): Examples: .. code-block:: python - + out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) """ helper = LayerHelper('concat', **locals()) @@ -239,7 +239,7 @@ def sums(input, out=None): return out -def assign(input, output): +def assign(input, output=None): """ **Assign** @@ -247,7 +247,7 @@ def assign(input, output): Args: input(Variable|numpy.ndarray): The source variable - output(Variable): The destination variable + output(Variable|None): The destination variable Returns: Variable: The destination variable that was supplied as the *output*. @@ -260,6 +260,8 @@ def assign(input, output): fluid.layers.assign(hidden, out) """ helper = LayerHelper('assign', **locals()) + if output is None: + output = helper.create_tmp_variable(dtype=input.dtype) if isinstance(input, Variable): helper.append_op( type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) @@ -443,7 +445,7 @@ def argmax(x, axis=0): return out -def argsort(input, axis=-1): +def argsort(input, axis=-1, name=None): """ Performs sorting on the input Variable along the given axis, and outputs sorted data Varibale and its corresponding index Variable with the same @@ -471,6 +473,8 @@ def argsort(input, axis=-1): axis(int): The axis along which to sort the input Variable. When :attr:`axis` < 0, the actual axis will be :attr:`axis` + rank(:attr:`input`). Default -1, the last dimension. + name(str|None): (optional) A name for this layer. If set None, the + layer will be named automatically. Returns: tuple: A tuple of sorted data Variable and the sorted indices. @@ -488,8 +492,8 @@ def argsort(input, axis=-1): type='argsort', inputs={'X': input}, outputs={'Out': out, - 'Indics': ids}, - attts={'axis': axis}) + 'Indices': ids}, + attrs={'axis': axis}) return out, ids diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py index c417ab393fca88d476d2f1fe83d12f99271d6883..b2b3186c1e8dd84e1527ff18744bd611f1f74c5f 100644 --- a/python/paddle/fluid/lod_tensor.py +++ b/python/paddle/fluid/lod_tensor.py @@ -18,15 +18,16 @@ import numpy as np __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] -def create_lod_tensor(data, lod, place): +def create_lod_tensor(data, recursive_seq_lens, place): """ Create a lod tensor from a numpy array, a list, or an existing lod tensor. 
Create a lod tensor by doing the following: - 1. Check that the length-based input lod is valid. + 1. Check that the length-based level of detail (LoD) also known as + recursive_sequence_lengths of the input is valid. - 2. Convert the length-based lod to a offset-based LoD. + 2. Convert recursive_sequence_lengths to a offset-based LoD. 3. Copy the data from a numpy array, a list or a existing lod tensor to CPU or GPU device (based on input place). @@ -37,45 +38,47 @@ def create_lod_tensor(data, lod, place): Suppose we want LoDTensor to hold data for sequences of word, where each word is represented by an integer. If we want to create a LoDTensor to - represent two sentences, one of 2 words, and one of 3 words. + represent two sentences, one of 2 words, and one of 3 words. Then :code:`data` can be a numpy array of integers with shape (5, 1). - :code:`lod` will be [[2, 3]], indicating the length(# of words) in each - sentence. This length-based input lod [[2, 3]] will be converted to - offset-based lod [[0, 2, 5]] inside the function call. + :code:`recursive_seq_lens` will be [[2, 3]], indicating the length(# of words) in each + sentence. This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to + offset-based LoD [[0, 2, 5]] inside the function call. Please reference :ref:`api_guide_low_level_lod_tensor` for more details regarding LoD. Args: data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a - list holding the data to be copied. - lod(list): a list of lists indicating the length-based LoD info - specified by the user. + list holding the data to be copied. + recursive_seq_lens(list): a list of lists indicating the length-based level of detail + info specified by the user. place(Place): CPU or GPU place indicating where the data in the new LoDTensor will be stored. Returns: - A fluid LoDTensor object with tensor data and lod info. + A fluid LoDTensor object with tensor data and recursive_seq_lens info. """ if isinstance(data, core.LoDTensor): - return create_lod_tensor(np.array(data), lod, place) + return create_lod_tensor(np.array(data), recursive_seq_lens, place) elif isinstance(data, list): # When input data is a list, it only deal with the case where the base element # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number # of words or other indexes in the sequence. 
- new_lod = [] + new_recursive_seq_lens = [] for seq in data: - new_lod.append(len(seq)) - assert [new_lod] == lod, "data and lod do not match" + new_recursive_seq_lens.append(len(seq)) + assert [ + new_recursive_seq_lens + ] == recursive_seq_lens, "data and recursive_seq_lens do not match" flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - return create_lod_tensor(flattened_data, lod, place) + return create_lod_tensor(flattened_data, recursive_seq_lens, place) elif isinstance(data, np.ndarray): tensor = core.LoDTensor() tensor.set(data, place) - tensor.set_recursive_sequence_lengths(lod) + tensor.set_recursive_sequence_lengths(recursive_seq_lens) assert tensor.has_valid_recursive_sequence_lengths( ), "the provided lod info is invalid" return tensor @@ -84,7 +87,8 @@ def create_lod_tensor(data, lod, place): "data should be either a LoDTensor, a Numpy array or a list") -def create_random_int_lodtensor(lod, base_shape, place, low, high): +def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low, + high): """ Create a LoDTensor containing random integers. @@ -95,7 +99,7 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): The function does the following: 1. Calculate the overall shape of the LoDTensor based on the length-based - :code:`lod` input and the shape of the basic element in + :code:`recursive_seq_lens` input and the shape of the basic element in :code:`base_shape`. 2. Create a numpy array of this shape. @@ -105,12 +109,13 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): Suppose we want LoDTensor to hold data for sequences of word, where each word is represented by an integer. If we want to create a LoDTensor to represent two sentences, one of 2 words, and one of 3 words. Then - 'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall - shape of the LoDTensor would be [5, 1], holding 5 words for two sentences. + 'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. + Then the overall shape of the LoDTensor would be [5, 1], holding 5 words + for two sentences. Args: - lod(list): a list of lists indicating the length-based LoD info - specified by the user. + recursive_seq_lens(list): a list of lists indicating the length-based + level of detail info specified by the user. base_shape(list): the shape of the basic element to be held by the LoDTensor. place(Place): CPU or GPU place indicating where the data in the new @@ -119,11 +124,11 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): high(int): the upper bound of the random integers. Returns: - A fluid LoDTensor object with tensor data and lod info. + A fluid LoDTensor object with tensor data and recursive_seq_lens info. 
""" assert isinstance(base_shape, list), "base_shape should be a list" # append the total number of basic elements to the front of its shape - overall_shape = [sum(lod[-1])] + base_shape + overall_shape = [sum(recursive_seq_lens[-1])] + base_shape # the range of integer data elements is [low, high] data = np.random.random_integers(low, high, overall_shape).astype("int64") - return create_lod_tensor(data, lod, place) + return create_lod_tensor(data, recursive_seq_lens, place) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index c9cd881979a4ea4b14299ce219be4b5bd1f153fc..17bb0826a6ea86c98a069263dfab84b99e1177ad 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -596,12 +596,12 @@ class Auc(MetricBase): tp, fn, tn, fp = 0, 0, 0, 0 for i, lbl in enumerate(labels): if lbl: - if predictions[i, 1] >= thresh: + if preds[i, 1] >= thresh: tp += 1 else: fn += 1 else: - if predictions[i, 1] >= thresh: + if preds[i, 1] >= thresh: fp += 1 else: tn += 1 diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 607a68e2565a247612f0e7b307088f85be91825c..75ee40fa9ca94cdd84ee7acbb62d6e652ac7fa33 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1113,7 +1113,6 @@ class ModelAverage(Optimizer): Args: average_window_rate: The rate of average window. - params_grads: A list of parameter-grad variable pairs. min_average_window: The minimum size of average window. max_average_window: The maximum size of average window. @@ -1122,8 +1121,8 @@ class ModelAverage(Optimizer): .. code-block:: python optimizer = fluid.optimizer.Momentum() - _, params_grads = optimizer.minimize(cost) - model_average = fluid.optimizer.ModelAverage(params_grads, 0.15, + optimizer.minimize(cost) + model_average = fluid.optimizer.ModelAverage(0.15, min_average_window=10000, max_average_window=20000) for pass_id in range(args.pass_num): @@ -1137,7 +1136,6 @@ class ModelAverage(Optimizer): def __init__(self, average_window_rate, - params_grads=None, min_average_window=10000, max_average_window=10000, **kwargs): @@ -1146,21 +1144,16 @@ class ModelAverage(Optimizer): self.min_average_window = min_average_window self.max_average_window = max_average_window - self.params_grads = [] if params_grads is None else params_grads - params = {} - for param, grad in self.params_grads: - if param.do_model_average != False: - params[param.name] = (param, grad) + self.params_grads = [] for param in framework.default_main_program().global_block( ).all_parameters(): - if param.name not in params and param.do_model_average != False: + if param.do_model_average != False: grad = param.block.create_var( name=unique_name.generate(".".join([param.name, 'tmp'])), dtype=param.dtype, persistable=False, stop_gradient=True) - params[param.name] = (param, grad) - self.params_grads = params.values() + self.params_grads.append((param, grad)) for param, grad in self.params_grads: self._append_average_accumulate_op(param) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 25cc1355d5a53e44b7f45c1f7d80673abcf567ec..6baf648198585022f992709c519038688af293e1 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -160,7 +160,7 @@ class ParallelExecutor(object): build_strategy, num_trainers, trainer_id) self.scope = scope - def run(self, fetch_list, feed=None, feed_dict=None): + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): """ Run 
a parallel executor with fetch_list. @@ -196,6 +196,8 @@ class ParallelExecutor(object): to each device. Default None. feed_dict: Alias for feed parameter, for backward compatibility. This parameter has been deprecated. Default None. + return_numpy(bool): Whether converts the fetched tensor to numpy. + Default: True. Returns: List: The fetched result list. @@ -270,6 +272,9 @@ class ParallelExecutor(object): if self.is_dist: self.bcast_params() + if return_numpy: + return executor.as_numpy(arr) + return [arr[i] for i in range(len(arr))] def bcast_params(self): diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index 0ccb3a39e02ea0c24bdfe01c5eba73b92da88a04..67aa21e8c5699f1cb568dad23cd13f4cb51a6ec9 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -206,35 +206,35 @@ def infer(use_cuda, inference_program, params_dirname): inferencer = fluid.Inferencer( inference_program, param_path=params_dirname, place=place) - # Setup inputs by creating LoDTensors to represent sequences of words. - # Here each word is the basic element of these LoDTensors and the shape of + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. 
+ recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_n2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_n1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_0 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_p1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_p2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) pred = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=PRED_DICT_LEN - 1) mark = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=MARK_DICT_LEN - 1) results = inferencer.infer( { diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index c4b37df3a09f93fe965ae28ce783f06f5018020d..8becd2404b0201c44b587a28e88995958082cd28 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -127,9 +127,19 @@ def decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=topk_size) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -138,10 +148,14 @@ def decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores @@ -215,11 +229,13 @@ def decode_main(use_cuda, is_sparse): [1. 
for _ in range(batch_size)], dtype='float32') init_ids_data = init_ids_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1)) - init_lod = [1] * batch_size - init_lod = [init_lod, init_lod] + init_recursive_seq_lens = [1] * batch_size + init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens] - init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) - init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, + place) + init_scores = fluid.create_lod_tensor(init_scores_data, + init_recursive_seq_lens, place) train_data = paddle.batch( paddle.reader.shuffle( @@ -243,7 +259,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.lod() + print result_ids.recursive_sequence_lengths() break diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py index 090c11ce1e79201f0d65d3540527791ab2191d4a..c860f1641708d947fd2a8008d3d3ccd0a231f6c2 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -209,13 +209,15 @@ def infer(use_cuda, inference_program, params_dirname): inference_program, param_path=params_dirname, place=place) # Use the first data from paddle.dataset.movielens.test() as input. - # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor, - # where `data` is a list of sequences of index numbers, `lod` is - # the level of detail (lod) info associated with `data`. + # Use create_lod_tensor(data, recursive_sequence_lengths, place) API + # to generate LoD Tensor where `data` is a list of sequences of index + # numbers, `recursive_sequence_lengths` is the length-based level of detail + # (lod) info associated with `data`. # For example, data = [[10, 2, 3], [2, 3]] means that it contains # two sequences of indexes, of length 3 and 2, respectively. - # Correspondingly, lod = [[3, 2]] contains one level of detail info, - # indicating that `data` consists of two sequences of length 3 and 2. + # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one + # level of detail info, indicating that `data` consists of two sequences + # of length 3 and 2, respectively. user_id = fluid.create_lod_tensor([[1]], [[1]], place) gender_id = fluid.create_lod_tensor([[1]], [[1]], place) age_id = fluid.create_lod_tensor([[0]], [[1]], place) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py index 9b61f7a00ce5e2a08c2105fb7f50e6868ef25df3..1668ae83d3581125b799508c8c3115a038e93d5a 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -128,17 +128,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. 
- # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py index aa7c567b4d66ba07c26d54436fb305011cfeccf2..8da89d82cb8e00853eebfd794602a0e1e1020e7c 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -143,17 +143,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. 
+ recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 8c74be0f08855c20f5aa3ecd75622a51e94a0304..74faa2e8aa734cd644dfcc38127fd12df1fb1092 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -138,17 +138,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index ba44f72d9b03c3a44560a8a30cba2253256314ef..02e65cf56c4d1bd262831320befd2edc735c0d1c 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -124,21 +124,22 @@ def infer(use_cuda, inference_program, params_dirname=None): # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The length-based level of - # detail (lod) info of each LoDtensor should be [[1]] meaning there is only - # one lod_level and there is only one sequence of one word on this level. - # Note that lod info should be a list of lists. - lod = [[1]] + # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, + # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] + # meaning there is only one level of detail and there is only one sequence of + # one word on this level. + # Note that recursive_sequence_lengths should be a list of lists. 
+ recursive_seq_lens = [[1]] base_shape = [1] # The range of random integers is [low, high] first_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) second_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) third_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) fourth_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) result = inferencer.infer( { diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index 5d9a47c9ba3db07f240b42732536f1ea37627a11..1df7b99aad6094a8b8ddfe783b9de35cef61c524 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -238,17 +238,21 @@ def infer(word_dict, use_cuda, save_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. @@ -257,7 +261,7 @@ def infer(word_dict, use_cuda, save_dirname=None): feed={feed_target_names[0]: tensor_words}, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) print("Inference results: ", np_data) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index e214ced0b5593c60ebd4a69edff1e961bcb4a72a..d489feae9c568ec1d9e3a230766d10d1ced0200a 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -247,35 +247,67 @@ def infer(use_cuda, save_dirname=None): [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - # Setup inputs by creating LoDTensors to represent sequences of words. 
- # Here each word is the basic element of these LoDTensors and the shape of + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) pred = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=pred_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=pred_dict_len - 1) ctx_n2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_n1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_0 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_p1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_p2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) mark = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=mark_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=mark_dict_len - 1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. 
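As a hedged sketch of the feed/fetch step that follows (the zip ordering below is an assumption for illustration; the real test feeds each input by its explicit variable name), note that the fetched LoDTensor now exposes its length-based lod through recursive_sequence_lengths():

.. code-block:: python

    # Illustrative only: assumes feed_target_names is ordered to match the
    # tensors listed here; the actual test feeds by explicit variable name.
    feed_tensors = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, pred, mark]
    results = exe.run(inference_program,
                      feed=dict(zip(feed_target_names, feed_tensors)),
                      fetch_list=fetch_targets,
                      return_numpy=False)
    # With return_numpy=False the result keeps its LoD; the length-based form
    # is now read back via recursive_sequence_lengths() instead of lod().
    print(results[0].recursive_sequence_lengths())
    print("Inference Shape: ", np.array(results[0]).shape)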
@@ -301,7 +333,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 372d6ec8223f69b69663137a646ba591108c40b7..90c301a66105d8d872ee531556c5060b5d727515 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -108,7 +108,7 @@ def decoder_decode(context, is_sparse): pre_state = pd.array_read(array=state_array, i=counter) pre_score = pd.array_read(array=scores_array, i=counter) - # expand the lod of pre_state to be the same with pre_score + # expand the recursive_sequence_lengths of pre_state to be the same with pre_score pre_state_expanded = pd.sequence_expand(pre_state, pre_score) pre_ids_emb = pd.embedding( @@ -126,9 +126,19 @@ def decoder_decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=50) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -137,10 +147,14 @@ def decoder_decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores @@ -238,11 +252,13 @@ def decode_main(use_cuda, is_sparse): [1. 
for _ in range(batch_size)], dtype='float32') init_ids_data = init_ids_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1)) - init_lod = [1] * batch_size - init_lod = [init_lod, init_lod] + init_recursive_seq_lens = [1] * batch_size + init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens] - init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) - init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, + place) + init_scores = fluid.create_lod_tensor(init_scores_data, + init_recursive_seq_lens, place) train_data = paddle.batch( paddle.reader.shuffle( @@ -266,7 +282,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.lod() + print result_ids.recursive_sequence_lengths() break diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 937d8dd5b065f0c1fdfc052b0342b572e3fbd7ac..6548766ef5d0162b50d4dd072e8e91dd95dc5d2b 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -260,13 +260,15 @@ def infer(use_cuda, save_dirname=None): # Use the first data from paddle.dataset.movielens.test() as input assert feed_target_names[0] == "user_id" - # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor - # where `data` is a list of sequences of index numbers, `lod` is - # the level of detail (lod) info associated with `data`. + # Use create_lod_tensor(data, recursive_sequence_lengths, place) API + # to generate LoD Tensor where `data` is a list of sequences of index + # numbers, `recursive_sequence_lengths` is the length-based level of detail + # (lod) info associated with `data`. # For example, data = [[10, 2, 3], [2, 3]] means that it contains # two sequences of indexes, of length 3 and 2, respectively. - # Correspondingly, lod = [[3, 2]] contains one level of detail info, - # indicating that `data` consists of two sequences of length 3 and 2. + # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one + # level of detail info, indicating that `data` consists of two sequences + # of length 3 and 2, respectively. user_id = fluid.create_lod_tensor([[1]], [[1]], place) assert feed_target_names[1] == "gender_id" diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index 7ada57def6bfedb113ea1a56f9677116b80488ea..467282624154086a874b0e73736ed5b1358915ff 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -216,19 +216,19 @@ def infer(use_cuda, save_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[4, 6]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[4, 6]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). 
Hence the LoDTensor will hold data for two sentences of # length 4 and 6, respectively. - # Note that lod info should be a list of lists. - lod = [[4, 6]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[4, 6]] base_shape = [1] # The range of random integers is [low, high] word_data = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=1) + recursive_seq_lens, base_shape, place, low=0, high=1) trg_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=1) + recursive_seq_lens, base_shape, place, low=0, high=1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. @@ -241,7 +241,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference shape: ", np_data.shape) print("Inference results: ", np_data) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 75bed06bd7a9b311ff9466589d6ecab2c37471ce..49bd72c7a53c0ae740bdbabe15b1d37340699d41 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -168,21 +168,22 @@ def infer(use_cuda, save_dirname=None): # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The length-based level of - # detail (lod) info of each LoDtensor should be [[1]] meaning there is only - # one lod_level and there is only one sequence of one word on this level. - # Note that lod info should be a list of lists. - lod = [[1]] + # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, + # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] + # meaning there is only one level of detail and there is only one sequence of + # one word on this level. + # Note that recursive_sequence_lengths should be a list of lists. 
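# Editorial sketch (not part of this patch): a minimal, standalone illustration of the
# length-based LoD API described in the comments above, using only calls that already
# appear in these tests (fluid.create_lod_tensor, recursive_sequence_lengths).
import paddle.fluid as fluid

place = fluid.CPUPlace()
# Two sequences of word indices, of length 3 and 2, so the length-based LoD is [[3, 2]].
data = [[10, 2, 3], [2, 3]]
recursive_seq_lens = [[3, 2]]
tensor = fluid.create_lod_tensor(data, recursive_seq_lens, place)
assert tensor.recursive_sequence_lengths() == recursive_seq_lens
# The list input is flattened into 3 + 2 = 5 basic elements, each of shape [1],
# so the underlying tensor holds 5 rows.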
+ recursive_seq_lens = [[1]] base_shape = [1] # The range of random integers is [low, high] first_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) second_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) third_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) fourth_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) assert feed_target_names[0] == 'firstw' assert feed_target_names[1] == 'secondw' @@ -200,7 +201,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py index b7e7f5801fbbe58626eeec5fc77736d04bb3cefb..f7a9dd4129027417a06a6c25ff9a801fff259c5e 100644 --- a/python/paddle/fluid/tests/test_lod_tensor.py +++ b/python/paddle/fluid/tests/test_lod_tensor.py @@ -19,18 +19,21 @@ import unittest class TestLoDTensor(unittest.TestCase): - def test_pybind_lod(self): + def test_pybind_recursive_seq_lens(self): tensor = fluid.LoDTensor() - lod = [] - tensor.set_recursive_sequence_lengths(lod) - lod = [[], [1], [3]] - self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod) - lod = [[0], [2], [3]] - self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod) + recursive_seq_lens = [] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + recursive_seq_lens = [[], [1], [3]] + self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, + recursive_seq_lens) + recursive_seq_lens = [[0], [2], [3]] + self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, + recursive_seq_lens) - lod = [[1, 2, 3]] - tensor.set_recursive_sequence_lengths(lod) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) + recursive_seq_lens = [[1, 2, 3]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) tensor.set(np.random.random([6, 1]), fluid.CPUPlace()) self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) @@ -38,13 +41,14 @@ class TestLoDTensor(unittest.TestCase): # Each level's sum should be equal to the number of items in the next level # Moreover, last level's sum should be equal to the tensor height - lod = [[2, 3], [1, 3, 1, 2, 2]] - tensor.set_recursive_sequence_lengths(lod) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) + recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 2]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) tensor.set(np.random.random([8, 1]), fluid.CPUPlace()) self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) - lod = [[2, 3], [1, 3, 1, 2, 1]] - tensor.set_recursive_sequence_lengths(lod) + recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 1]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) 
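# Editorial sketch (not part of this patch): the validity rule exercised by this test,
# spelled out for one of the LoDs used above. Each level's lengths must sum to the number
# of entries in the next level, and the last level must sum to the tensor height.
import numpy as np
import paddle.fluid as fluid

recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 2]]
assert sum(recursive_seq_lens[0]) == len(recursive_seq_lens[1])   # 5 sequences at level 1
assert sum(recursive_seq_lens[1]) == 9                            # 9 basic elements in total

tensor = fluid.LoDTensor()
tensor.set_recursive_sequence_lengths(recursive_seq_lens)
tensor.set(np.random.random([9, 1]), fluid.CPUPlace())    # height 9 matches the last level sum -> valid
assert tensor.has_valid_recursive_sequence_lengths()
tensor.set(np.random.random([8, 1]), fluid.CPUPlace())    # height 8 does not -> invalid
assert not tensor.has_valid_recursive_sequence_lengths()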
self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) @@ -52,35 +56,42 @@ class TestLoDTensor(unittest.TestCase): def test_create_lod_tensor(self): # Create LoDTensor from a list data = [[1, 2, 3], [3, 4]] - wrong_lod = [[2, 2]] - correct_lod = [[3, 2]] - self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod, - fluid.CPUPlace()) - tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace()) - self.assertEqual(tensor.recursive_sequence_lengths(), correct_lod) + wrong_recursive_seq_lens = [[2, 2]] + correct_recursive_seq_lens = [[3, 2]] + self.assertRaises(AssertionError, create_lod_tensor, data, + wrong_recursive_seq_lens, fluid.CPUPlace()) + tensor = create_lod_tensor(data, correct_recursive_seq_lens, + fluid.CPUPlace()) + self.assertEqual(tensor.recursive_sequence_lengths(), + correct_recursive_seq_lens) # Create LoDTensor from numpy array data = np.random.random([10, 1]) - lod = [[2, 1], [3, 3, 4]] - tensor = create_lod_tensor(data, lod, fluid.CPUPlace()) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) + recursive_seq_lens = [[2, 1], [3, 3, 4]] + tensor = create_lod_tensor(data, recursive_seq_lens, fluid.CPUPlace()) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) # Create LoDTensor from another LoDTensor, they are differnt instances - new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]] - new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace()) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) - self.assertEqual(new_tensor.recursive_sequence_lengths(), new_lod) + new_recursive_seq_lens = [[2, 2, 1], [1, 2, 2, 3, 2]] + new_tensor = create_lod_tensor(tensor, new_recursive_seq_lens, + fluid.CPUPlace()) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) + self.assertEqual(new_tensor.recursive_sequence_lengths(), + new_recursive_seq_lens) def test_create_random_int_lodtensor(self): # The shape of a word, commonly used in speech and NLP problem, is [1] shape = [1] - lod = [[2, 3, 5]] + recursive_seq_lens = [[2, 3, 5]] dict_size = 10000 low = 0 high = dict_size - 1 - tensor = create_random_int_lodtensor(lod, shape, + tensor = create_random_int_lodtensor(recursive_seq_lens, shape, fluid.CPUPlace(), low, high) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) self.assertEqual(tensor.shape(), [10, 1]) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 219ab9bc2cc74a3c16f7bda69d4d782283574d7e..5f27864c140573086d07415f83caca708889a068 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -51,3 +51,4 @@ py_test_modules(test_dist_train MODULES test_dist_train SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) +set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 829c5a1a5fd099543e9e98b9587d4f316a91b587..cddf00765f4894126988c794763c34629449e8e6 100644 --- 
a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -18,6 +18,8 @@ import unittest import paddle.fluid as fluid import time import numpy as np +import math +import sys __all__ = ['TestParallelExecutorBase'] @@ -81,7 +83,6 @@ class TestParallelExecutorBase(unittest.TestCase): begin = time.time() first_loss, = run_executor( exe=exe, feed=feed_dict, fetch_list=[loss.name]) - first_loss = np.array(first_loss) for i in xrange(iter): run_executor(exe=exe, feed=feed_dict, fetch_list=[]) @@ -94,7 +95,11 @@ class TestParallelExecutorBase(unittest.TestCase): print "%.4f Instance per second" % ( (batch_size * iter + 2) / (end - begin)) - last_loss = np.array(last_loss) + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") print first_loss, last_loss # self.assertGreater(first_loss[0], last_loss[0]) diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py index 1d0aa82a6b398592dc5b905be56591a1094b9e41..b29a102a3880406156481fdac54ca7043d3415db 100644 --- a/python/paddle/fluid/tests/unittests/test_argsort_op.py +++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py @@ -20,7 +20,7 @@ from op_test import OpTest class TestArgsortOp(OpTest): def setUp(self): self.init_axis() - x = np.random.random((2, 3, 4, 5)).astype("float32") + x = np.random.random((2, 3, 4, 5, 10)).astype("float32") if self.axis < 0: self.axis = self.axis + len(x.shape) self.indices = np.argsort(x, kind='quicksort', axis=self.axis) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py index f6097d4b846e8da1c4ee3cc49b31f9873660056d..18fa5461590134d2032a29e40699109c12092c6d 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py @@ -52,5 +52,17 @@ class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference): self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) +class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference): + def init_kernel_type(self): + self.use_mkldnn = True + self.fuse_with_relu = True + + def test_check_output(self): + place = core.CPUPlace() + data_format = "NCHW" + + self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 01e5749bdb9729c697af1ae87d993a2da66217f8..a62ee9596d0f6c58135b4a13249b638e84e63c3c 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -159,6 +159,7 @@ class TestBatchNormOpInference(unittest.TestCase): def setUp(self): self.dtype = np.float32 self.use_mkldnn = False + self.fuse_with_relu = False self.init_kernel_type() def __assert_close(self, tensor, np_array, msg, atol=1e-4): @@ -180,6 +181,8 @@ class TestBatchNormOpInference(unittest.TestCase): scale_shape = [c] x_val = np.random.random_sample(x_shape).astype(dtype) + # generate some negative values to test case with relu fused + x_val = x_val - 0.5 scale_val = np.random.random_sample(scale_shape).astype(np.float32) 
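# Editorial sketch (not part of this patch): the reference behaviour the new fuse_with_relu
# attribute is meant to reproduce at inference time -- batch_norm followed by relu collapses
# to clamping the normalised output at zero. Shapes and names below are illustrative only
# (NCHW layout assumed), not taken from this test.
import numpy as np

def fused_bn_relu_reference(x, scale, bias, mean, var, epsilon=1e-5):
    # inference-mode batch norm, normalising over the channel axis
    y = (x - mean.reshape(1, -1, 1, 1)) / np.sqrt(var.reshape(1, -1, 1, 1) + epsilon)
    y = y * scale.reshape(1, -1, 1, 1) + bias.reshape(1, -1, 1, 1)
    # fusing relu into the op simply clamps the result at zero
    return np.maximum(y, 0)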
bias_val = np.random.random_sample(scale_shape).astype(np.float32) @@ -188,6 +191,8 @@ class TestBatchNormOpInference(unittest.TestCase): y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance, epsilon, data_layout).astype(dtype) + if self.fuse_with_relu: + y_out = np.maximum(y_out, 0) scope = core.Scope() @@ -233,6 +238,7 @@ class TestBatchNormOpInference(unittest.TestCase): is_test=True, data_layout=data_layout, use_mkldnn=self.use_mkldnn, + fuse_with_relu=self.fuse_with_relu, epsilon=epsilon) batch_norm_op.run(scope, place) @@ -265,6 +271,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): def setUp(self): self.dtype = np.float16 self.use_mkldnn = False + self.fuse_with_relu = False self.init_kernel_type() def test_check_output(self): @@ -284,6 +291,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.use_mkldnn = False + self.fuse_with_relu = False self.data_formats = ["NCHW", "NHWC"] self.init_kernel_type() @@ -367,7 +375,8 @@ class TestBatchNormOpTraining(unittest.TestCase): "epsilon": epsilon, "is_test": False, "data_layout": data_layout, - "use_mkldnn": self.use_mkldnn + "use_mkldnn": self.use_mkldnn, + "fuse_with_relu": self.fuse_with_relu }) block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index 4e1687477c6b89b34f0b35823f9587704a131e85..db5771f7b0ad74c73b81d502209c17dce3ce8457 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -20,44 +20,58 @@ from paddle.fluid.op import Operator class TestBeamSearchDecodeOp(unittest.TestCase): + """unittest of beam_search_decode_op""" + def setUp(self): self.scope = core.Scope() self.place = core.CPUPlace() def append_lod_tensor(self, tensor_array, lod, data): lod_tensor = core.LoDTensor() - lod_tensor.set_recursive_sequence_lengths(lod) + lod_tensor.set_lod(lod) lod_tensor.set(data, self.place) tensor_array.append(lod_tensor) def test_get_set(self): ids = self.scope.var("ids").get_lod_tensor_array() - self.append_lod_tensor( - ids, [[3, 3], [1, 1, 1, 1, 1, 1]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="int64")) - self.append_lod_tensor( - ids, [[3, 3], [1, 0, 2, 2, 0, 1]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="int64")) - self.append_lod_tensor( - ids, [[3, 3], [0, 1, 1, 1, 1, 1]], - np.array( - [0, 1, 2, 3, 4], dtype="int64")) - scores = self.scope.var("scores").get_lod_tensor_array() - self.append_lod_tensor( - scores, [[3, 3], [1, 1, 1, 1, 1, 1]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="float64")) - self.append_lod_tensor( - scores, [[3, 3], [1, 0, 2, 2, 0, 1]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="float64")) - self.append_lod_tensor( - scores, [[3, 3], [0, 1, 1, 1, 1, 1]], - np.array( - [0, 1, 2, 3, 4], dtype="float64")) + # Construct sample data with 5 steps and 2 source sentences + # beam_size = 2, end_id = 1 + # start with start_id + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 1, 2]], np.array( + [0, 0], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 2, 4]], + np.array( + [2, 3, 4, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 2, 2, 4, 4]], + np.array( + [3, 1, 5, 4], 
dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 1, 2, 3, 4]], + np.array( + [1, 1, 3, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 0, 0, 2, 2]], + np.array( + [5, 1], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] sentence_ids = self.scope.var("sentence_ids").get_tensor() sentence_scores = self.scope.var("sentence_scores").get_tensor() @@ -69,18 +83,18 @@ class TestBeamSearchDecodeOp(unittest.TestCase): Scores="scores", # outputs SentenceIds="sentence_ids", - SentenceScores="sentence_scores") + SentenceScores="sentence_scores", + beam_size=2, + end_id=1, ) beam_search_decode_op.run(self.scope, self.place) - expected_lod = [[4, 4], [1, 2, 3, 3, 1, 3, 3, 3]] - self.assertEqual(sentence_ids.recursive_sequence_lengths(), - expected_lod) - self.assertEqual(sentence_scores.recursive_sequence_lengths(), - expected_lod) + expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]] + self.assertEqual(sentence_ids.lod(), expected_lod) + self.assertEqual(sentence_scores.lod(), expected_lod) expected_data = np.array( - [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64") + [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64") self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) self.assertTrue( np.array_equal(np.array(sentence_scores), expected_data)) diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index 5a14178c278c76b060b79facc041f0853d09c370..167451edd8c46c006c8019678a304a38f18cb946 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -26,9 +26,12 @@ def create_tensor(scope, name, np_data): class BeamSearchOpTester(unittest.TestCase): + """unittest of beam_search_op""" + def setUp(self): self.scope = core.Scope() self._create_ids() + self._create_pre_scores() self._create_scores() self._create_pre_ids() self.scope.var('selected_ids') @@ -37,7 +40,8 @@ class BeamSearchOpTester(unittest.TestCase): def test_run(self): op = Operator( 'beam_search', - pre_ids="pre_ids", + pre_ids='pre_ids', + pre_scores='pre_scores', ids='ids', scores='scores', selected_ids='selected_ids', @@ -47,19 +51,31 @@ class BeamSearchOpTester(unittest.TestCase): end_id=0, ) op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() - print 'selected_ids', np.array(selected_ids) - print 'lod', selected_ids.recursive_sequence_lengths() + selected_scores = self.scope.find_var("selected_scores").get_tensor() + self.assertTrue( + np.allclose( + np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis])) + self.assertTrue( + np.allclose( + np.array(selected_scores), + np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) + self.assertEqual(selected_ids.lod(), + [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]]) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') - tensor = create_tensor(self.scope, "pre_ids", np_data) + tensor = create_tensor(self.scope, 'pre_ids', np_data) + + def _create_pre_scores(self): + np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32') + tensor = create_tensor(self.scope, 'pre_scores', np_data) def _create_ids(self): - self.lod = [[1, 3], [1, 1, 1, 1]] + self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]] np_data = np.array( [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 
1]], dtype='int64') tensor = create_tensor(self.scope, "ids", np_data) - tensor.set_recursive_sequence_lengths(self.lod) + tensor.set_lod(self.lod) def _create_scores(self): np_data = np.array( @@ -71,7 +87,7 @@ class BeamSearchOpTester(unittest.TestCase): ], dtype='float32') tensor = create_tensor(self.scope, "scores", np_data) - tensor.set_recursive_sequence_lengths(self.lod) + tensor.set_lod(self.lod) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index 1a245fd756cb2bcaca720f10fa35fd3d2a45cd4d..d5bd726c4a82ee839703c69a933100bb056cb736 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -114,6 +114,23 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): self.check_output() +class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest): + def setUp(self): + self.op_type = 'bipartite_match' + lod = [[300]] + dist = np.random.random((300, 17)).astype('float32') + match_indices, match_dist = batch_bipartite_match(dist, lod[0]) + + self.inputs = {'DistMat': dist} + self.outputs = { + 'ColToRowMatchIndices': match_indices, + 'ColToRowMatchDist': match_dist, + } + + def test_check_output(self): + self.check_output() + + class TestBipartiteMatchOpWithPerPredictionType(OpTest): def setUp(self): self.op_type = 'bipartite_match' diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index ded2f130288a4a959a1c859b2cc8ccf0912efb12..07545e7feb46c85a4b80f9b846be27d36cbfb59a 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -242,6 +242,19 @@ class TestCUDNNWithGroups(TestWithGroups): self.op_type = "conv2d_transpose" +class TestDepthwiseConvTranspose(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.input_size = [2, 8, 16, 16] # NCHW + self.groups = 8 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [self.input_size[1], f_c, 4, 4] + self.op_type = "depthwise_conv2d_transpose" + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..ad2d57f7c5f127be87e963508e1dd150fdd30225 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -0,0 +1,210 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal + +SEED = 1 +DTYPE = "float32" +paddle.dataset.mnist.fetch() + + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(batch_size): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +def run_pserver(pserver_endpoints, trainers, current_endpoint): + get_model(batch_size=20) + t = get_transpiler(0, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + exe.run(pserver_prog) + + +class TestDistMnist(unittest.TestCase): + def setUp(self): + self._trainers = 1 + self._pservers = 1 + self._ps_endpoints = "127.0.0.1:9123" + + def start_pserver(self, endpoint): + p = Process( + target=run_pserver, + args=(self._ps_endpoints, self._trainers, endpoint)) + p.start() + return p.pid + + def _wait_ps_ready(self, pid): + retry_times = 5 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(1) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def stop_pserver(self, pid): + os.kill(pid, signal.SIGTERM) + + def test_with_place(self): + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + pserver_pid = self.start_pserver(self._ps_endpoints) + self._wait_ps_ready(pserver_pid) + + self.run_trainer(p, 0) + + self.stop_pserver(pserver_pid) + + def run_trainer(self, place, trainer_id): + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( + batch_size=20) + t = get_transpiler(trainer_id, + fluid.default_main_program(), self._ps_endpoints, + self._trainers) + + trainer_prog = t.get_trainer_program() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.itervalues() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + for pass_id in xrange(10): + for batch_id, data in enumerate(train_reader()): + exe.run(trainer_prog, feed=feeder.feed(data)) + + if (batch_id + 1) % 10 == 0: + acc_set = [] + avg_loss_set = [] + for test_data in test_reader(): + acc_np, avg_loss_np = exe.run( + program=test_program, + feed=feeder.feed(test_data), + fetch_list=[batch_acc, avg_cost]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val = np.array(acc_set).mean() + avg_loss_val = np.array(avg_loss_set).mean() + if float(acc_val + ) > 0.8: # Smaller value to increase CI speed + return + else: + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_val), float(acc_val))) + if math.isnan(float(avg_loss_val)): + assert ("got Nan loss, training failed.") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..bcdbfc8e527d0dc9a95eddaf040f8035207b6c20 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py @@ -0,0 +1,130 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +from test_elementwise_add_op import * +''' +Some tests differ from the tests defined in test_elementwise_add_op.py +because MKLDNN does not support tensors of number of dimensions 3. +Such dimensions cause exceptions in MKLDNN reorder primitive. 
+''' + + +class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(2).astype(self.dtype) + self.out = self.x + self.y.reshape(2, 1, 1, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 3, 1, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2): + def init_input_output(self): + self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype) + self.y = np.random.rand(4).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 1, 4) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_rowwise_add_0( + TestElementwiseAddOp_rowwise_add_0): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 3, 4, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_rowwise_add_1( + TestElementwiseAddOp_rowwise_add_1): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_channelwise_add( + TestElementwiseAddOp_channelwise_add): + def init_input_output(self): + self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 96d47906a0606bba4b1d2207f7da85b058e42a2b..fb9a496126f0b6efcad73590c78efe5a47b88cd6 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -18,19 +18,23 @@ from op_test import OpTest class TestElementwiseAddOp(OpTest): + def init_kernel_type(self): + self.use_mkldnn = False + def setUp(self): self.op_type = "elementwise_add" self.dtype = np.float32 self.axis = -1 self.init_dtype() self.init_input_output() + self.init_kernel_type() self.init_axis() self.inputs = { 'X': OpTest.np_dtype_to_fluid_dtype(self.x), 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - self.attrs = {'axis': self.axis} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 82074955fae7514d556ba9319c11beb250c4de11..842d34c07e94a79e3351347e2528ecc478cc56dc 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -401,7 +401,7 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) - def test_maxout(self): + def test_crop(self): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 5], dtype="float32") @@ -410,6 +410,24 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_mean_iou(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[16], dtype='float32') + y = layers.data(name='label', shape=[1], dtype='int64') + iou = layers.mean_iou(x, y, 2) + self.assertIsNotNone(iou) + print(str(program)) + + def test_argsort(self): + program = Program() + with program_guard(program): + data = layers.data(name='x', shape=[2, 3, 3], dtype="float32") + out, ids = layers.argsort(input=data, axis=1) + self.assertIsNotNone(out) + self.assertIsNotNone(ids) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 1ea7a6a5682318fb5f4ef8b3a08911df3cd44acf..63fb58c6927fa387b3b19147b9dc9d24bb8e5132 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -169,9 +169,8 @@ class TestCRFModel(unittest.TestCase): data = train_data() for i in xrange(10): cur_batch = next(data) - print map(np.array, - pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name]))[0] + print pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])[0] @unittest.skip(reason="CI hangs") def test_update_sparse_parameter_all_reduce(self): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 79702475cca86ca22107d4b1824fda277dd83157..1f5d2f16773efb7537de85abec88344f8e0daa9f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -75,7 +75,9 @@ class TestFetchOp(unittest.TestCase): fetch_list.append(k) for data in train_inputs: - ret = pe.run(fetch_list, feed=feeder.feed(data)) + ret = pe.run(fetch_list, + feed=feeder.feed(data), + return_numpy=True) for i in range(len(fetch_list)): assert not math.isnan(np.sum(ret[i])) and \ not math.isinf(np.sum(ret[i])) @@ -128,7 +130,7 @@ class TestFeedParallel(unittest.TestCase): use_cuda=use_cuda, 
loss_name=loss.name, main_program=main) for batch_id, data in enumerate(reader()): - loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) + loss_np = pe.run(feed=data, fetch_list=[loss.name])[0] print batch_id, loss_np if batch_id == 2: break diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 31ba8c1d6096c9c89e0695c8eca8e16a5e303a61..9a2733927d38f1a2b1af92fcc12f036158b4d06f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -16,6 +16,8 @@ import paddle.fluid as fluid import numpy as np import unittest import os +import sys +import math def simple_fc_net(): @@ -70,10 +72,17 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): for i in xrange(5): test_loss, = test_exe.run([loss.name], feed=feed_dict) - test_loss = np.array(test_loss) train_loss, = train_exe.run([loss.name], feed=feed_dict) - train_loss = np.array(train_loss) + + avg_test_loss_val = np.array(test_loss).mean() + if math.isnan(float(avg_test_loss_val)): + sys.exit("got NaN loss, testing failed.") + + avg_train_loss_val = np.array(train_loss).mean() + if math.isnan(float(avg_train_loss_val)): + sys.exit("got NaN loss, training failed.") + self.assertTrue( np.allclose( train_loss, test_loss, atol=1e-8), diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 45ab889beaa1355d0e1e2922aedf0340f70809ba..b6e0241265b18377874efb0d223441994b4650d0 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -119,27 +119,20 @@ class CheckpointConfig(object): max_num_checkpoints=3, epoch_interval=1, step_interval=10): - if checkpoint_dir is None: - self.checkpoint_dir = os.getcwd() - else: - self.checkpoint_dir = checkpoint_dir - - self.max_num_checkpoints = max_num_checkpoints - - if epoch_interval < 1: - self.epoch_interval = 1 - else: - self.epoch_interval = epoch_interval - if step_interval < 1: - self.step_interval = 10 - else: - self.step_interval = step_interval + assert epoch_interval >= 1 + assert step_interval >= 1 + self.checkpoint_dir = checkpoint_dir \ + if checkpoint_dir is not None else os.getcwd() + self.max_num_checkpoints = max_num_checkpoints + self.epoch_interval = epoch_interval + self.step_interval = step_interval self.epoch_id = 0 self.step_id = 0 self.load_serial = None - self.is_pserver = False + self.pserver_id = None + self.lookup_table_name = None def check_and_get_place(place): @@ -290,13 +283,20 @@ class Trainer(object): self.checkpoint_cfg.load_serial, self.startup_program) - if not self.checkpoint_cfg.is_pserver: - epoch_id, step_id = io.load_trainer_args( - self.checkpoint_cfg.checkpoint_dir, - self.checkpoint_cfg.load_serial, self.trainer_id, - self._get_checkpoint_load_args()) - self.checkpoint_cfg.epoch_id = int(epoch_id) - self.checkpoint_cfg.step_id = int(step_id) + if not self.checkpoint_cfg.pserver_id: + epoch_id, step_id = io.load_trainer_args( + self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, self.trainer_id, + self._get_checkpoint_load_args()) + self.checkpoint_cfg.epoch_id = int(epoch_id) + self.checkpoint_cfg.step_id = int(step_id) + else: + if self.checkpoint_cfg.lookup_table_name: + io.load_lookup_table_vars( + exe, self.checkpoint_cfg.checkpoint_dir, + self.startup_program, + self.checkpoint_cfg.pserver_id, + 
self.checkpoint_cfg.lookup_table_name) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -315,7 +315,7 @@ class Trainer(object): for ip in worker_ips.split(","): worker_endpoints.append(':'.join([ip, port])) self.num_trainers = len(worker_endpoints) - current_endpoint = os.getenv("POD_IP") + ":" + port + current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port worker_endpoints.remove(current_endpoint) # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id # in ParallelExecutor to start @@ -366,7 +366,10 @@ class Trainer(object): self.trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": if self.checkpoint_cfg: - self.is_pserver = True + pserver_id = eplist.index(current_endpoint) + self.checkpoint_cfg.pserver_id = pserver_id + if t.has_distributed_lookup_table: + self.checkpoint_cfg.lookup_table_name = t.table_name self.train_program = t.get_pserver_program(current_endpoint) self.startup_program = t.get_startup_program(current_endpoint, @@ -566,7 +569,8 @@ class Trainer(object): def _save_checkpoint(self, epoch_id, step_id): assert self.checkpoint_cfg - if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0: + if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ + and step_id % self.checkpoint_cfg.step_interval == 0: exe = executor.Executor(self.place) io.save_checkpoint( executor=exe, diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index bb61f82a9cf7f837f0403082165a2375d18b574e..343901cda3f505c3b3d2ed0c30cf7fea71c8b6b1 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -301,8 +301,8 @@ class DistributeTranspiler(object): Program: trainer side program. """ # remove optimize ops and add a send op to main_program + # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? delete_ops(self.origin_program.global_block(), self.optimize_ops) - # FIXME(typhoonzero): serialize once will fix error occurs when clone. 
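# Editorial sketch (not part of this patch): the typical call sequence around the trainer /
# pserver program split discussed here, mirroring the calls already used in test_dist_mnist.
# The endpoint string and trainer count are illustrative values only, and a network plus
# optimizer are assumed to have been added to the default main program beforehand.
import paddle.fluid as fluid

pserver_endpoint = "127.0.0.1:6174"
t = fluid.DistributeTranspiler()
t.transpile(trainer_id=0,
            program=fluid.default_main_program(),
            pservers=pserver_endpoint,
            trainers=1)
# trainer side: optimize ops are removed and a send op is appended
trainer_program = t.get_trainer_program()
# pserver side: a listen_and_serv program plus its startup program
pserver_program = t.get_pserver_program(pserver_endpoint)
startup_program = t.get_startup_program(pserver_endpoint, pserver_program)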
self.origin_program.__str__() return self.origin_program @@ -383,11 +383,12 @@ class DistributeTranspiler(object): if self._is_adam_connected_op(op): global_ops.append(op) - def __append_optimize_op__(op, block, grad_to_block_id, merged_var): + def __append_optimize_op__(op, block, grad_to_block_id, merged_var, + lr_ops): if self._is_optimizer_op(op): self._append_pserver_ops(block, op, endpoint, grad_to_block_id, self.origin_program, merged_var) - else: + elif op not in lr_ops: self._append_pserver_non_opt_ops(block, op) def __op_have_grad_input__(op): @@ -452,7 +453,7 @@ class DistributeTranspiler(object): # optimizer is connected to itself if ufind.is_connected(op, opt_op) and op not in global_ops: __append_optimize_op__(op, per_opt_block, grad_to_block_id, - merged_var) + merged_var, lr_ops) # append global ops if global_ops: @@ -461,7 +462,7 @@ class DistributeTranspiler(object): optimize_blocks.append(opt_state_block) for glb_op in global_ops: __append_optimize_op__(glb_op, opt_state_block, - grad_to_block_id, None) + grad_to_block_id, None, lr_ops) # process distributed lookup_table prefetch_var_name_to_block_id = [] @@ -471,6 +472,8 @@ class DistributeTranspiler(object): pserver_index, pserver_program, pre_block_idx, grad_to_block_id) prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) + checkpoint_block_id = self._create_checkpoint_save_block( + pserver_program, table_opt_block.idx) # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place @@ -489,6 +492,7 @@ class DistributeTranspiler(object): if len(prefetch_var_name_to_block_id) > 0: attrs['prefetch_var_name_to_block_id'] \ = prefetch_var_name_to_block_id + attrs['checkpint_block_id'] = checkpoint_block_id # step5 append the listen_and_serv op pserver_program.global_block().append_op( @@ -534,7 +538,6 @@ class DistributeTranspiler(object): # 2. rename op outputs for op in orig_s_prog.global_block().ops: - new_inputs = dict() new_outputs = dict() # do not append startup op if var is not on this pserver op_on_pserver = False @@ -910,6 +913,27 @@ class DistributeTranspiler(object): return table_opt_block + def _create_checkpoint_save_block(self, pserver_program, pre_block_idx): + """ + create a new block to handle save checkpoint. 
+ """ + import os + + pserver_program.global_block().create_var( + name="kLookupTablePath", + persistable=True, + type=core.VarDesc.VarType.RAW) + + checkpoint_save_block = pserver_program.create_block(pre_block_idx) + # this 'file_path' do not be used in save lookup table variable + checkpoint_save_block.append_op( + type='save', + inputs={'X': [self.table_name]}, + outputs={}, + attrs={'file_path': "none"}) + + return checkpoint_save_block.idx + def _create_vars_from_blocklist(self, program, block_list, @@ -1299,16 +1323,6 @@ class DistributeTranspiler(object): ufind.union(op1, op2) return ufind - def _is_opt_role_op(self, op): - # NOTE: depend on oprole to find out whether this op is for - # optimize - op_maker = core.op_proto_and_checker_maker - optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize - if op_maker.kOpRoleAttrName() in op.attrs and \ - int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): - return True - return False - def _is_optimizer_op(self, op): if "Param" in op.input_names and \ "LearningRate" in op.input_names: @@ -1399,7 +1413,10 @@ class DistributeTranspiler(object): params_grads = [] origin_var_dict = self.origin_program.global_block().vars for op in block.ops: - if self._is_opt_role_op(op): + # NOTE(Yancey1989): we can not use op role to distinguish an optimizer op + # or not, because all ops in optimizer sub-graph would + # sign the optimizer op role + if self._is_optimizer_op(op): opt_ops.append(op) # HACK(wuyi): if we find grad vars from input of optimize # ops, we may get the output of clip op. Use syntax "@GRAD" diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 0629f2916b339a6cd19ccadf435a67a17d6da4cc..d32c69d148dfa1633ce344611ca3fe7879a234e9 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import numpy as np from .. import core from ..framework import Program @@ -20,12 +21,15 @@ from ..executor import global_scope class InferenceTranspiler: ''' - Convert the fluid program to optimized inference program. - - There are several optimizations, only fuse batch normalization is supported now. + Convert the fluid program to optimized inference program. + + There are several optimizations: + + - fuse convolution and batch normalization + - fuse batch normalization and relu (MKLDNN only) Examples: - + .. code-block:: python # As InferenceTranspiler will modify the original program, @@ -54,19 +58,64 @@ class InferenceTranspiler: if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") self.fuse_batch_norm(program, place, scope) + self.fuse_relu_mkldnn(program) + + def fuse_relu_mkldnn(self, program): + ''' + Transpile the program by fused relu activation for MKLDNN program. + + Relu activation following batch norm OP can be fused by adding + :math:`fuse_with_relu` attribute to batch norm OP. 
+ + The result of fuse is: + + - before: + + - batch_norm->relu->any_other_op + + - after: + + - batch_norm->any_other_op + + :param program: program to transpile + :type program: Program + ''' + use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + if not use_mkldnn: + return + + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops) - 1: + current_op = self.block.ops[i] + if current_op.type in ['batch_norm']: + next_op = self.block.ops[i + 1] + if next_op.type == 'relu': + # modify bnorm OP to include relu + current_op.set_attr("fuse_with_relu", True) + # remove relu OP + self.block.remove_op(i + 1) + i = i + 1 + + self._remove_unused_var() + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() def fuse_batch_norm(self, program, place, scope): ''' Transpile the program by fused batch normalization. - - The batch normalization followed the convolution or fully connected layer - can be integrated with them. Doing so will give us a forward acceleration, + + The batch normalization followed the convolution or fully connected layer + can be integrated with them. Doing so will give us a forward acceleration, especially in environments like mobile or embedded. - + For input :math:`X`: - - Conv process: :math:`X = input * W + bias` - - Batch norm process: :math:`X' = (X - mean) / std` + - Conv process: :math:`X = input * W + bias` + - Batch norm process: :math:`X' = (X - mean) / std` - Scale Process: :math:`Y = a * X' + b` After fuse into one operation: @@ -76,17 +125,17 @@ class InferenceTranspiler: Y &= (input * W + bias - mean) / std * a + b \\\\ &= input * a * W / std + ((bias - mean) / std * a + b) - The operator transformation is: + The operator transformation is: - before: - conv->batch_norm->any_other_op (bias == 0) - conv->elementwise_add->batch_norm->any_other_op (bias != 0) - - - after: + + - after: - conv->elementwise_add->any_other_op - + The transpile stages are: 1. insert elementwise_add op when bias == 0. @@ -99,20 +148,20 @@ class InferenceTranspiler: program (Program): program to transpile place (Place): inference place scope (Scope): inference Scope - + ''' self.scope = scope self.place = place self.block = program.block(0) - self.input_map = {} # store the input names should be adjusted + self.input_map = {} # store the input names should be adjusted i = 0 - while i < len(self.block.ops): + while i < len(self.block.ops) - 2: current_op = self.block.ops[i] # TODO(luotao1): consider only conv2d now. fc would be delt later. if current_op.type in ['conv2d']: - # TODO(luotao1): consider single chain network now. - # For branch network, we counldn't use block.ops[i + 1] as + # TODO(luotao1): consider single chain network now. + # For branch network, we counldn't use block.ops[i + 1] as # the judgment condition. next_op = self.block.ops[i + 1] # conv2d without bias @@ -137,17 +186,17 @@ class InferenceTranspiler: self._adjust_input() self._remove_unused_var() - # TODO(luotao): use clone() method to flush the program.desc in force, - # since some large program.desc will not be flushed immediately. + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. 
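# Editorial sketch (not part of this patch): a small numpy check of the parameter folding
# derived in the fuse_batch_norm docstring above, written for a fully connected layer so the
# algebra stays one-dimensional. All names here are illustrative.
import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(4, 8)                       # a batch of inputs
W, bias = rng.rand(8, 3), rng.rand(3)    # layer weight and bias
a, b = rng.rand(3), rng.rand(3)          # batch-norm scale and shift
mean, var, eps = rng.rand(3), rng.rand(3) + 0.5, 1e-5
std = np.sqrt(var + eps)

# original two-step computation: affine layer followed by inference batch norm
y_ref = ((x.dot(W) + bias) - mean) / std * a + b
# fused computation: fold the scale into the weight and everything else into the bias
W_fused = W * (a / std)
bias_fused = (bias - mean) / std * a + b
y_fused = x.dot(W_fused) + bias_fused

assert np.allclose(y_ref, y_fused)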
program = program.clone() # ====================== private transpiler functions ===================== def _insert_bias_op(self, index, current_op, bn_op): ''' - Construct elementwise_add operator for adding bias + Construct elementwise_add operator for adding bias and insert it into program. - + :param index: insert location of bias_op :type index: Int :param current_op: current operator (conv or fc) @@ -175,14 +224,14 @@ class InferenceTranspiler: def _fuse_param(self, current_op, bn_op, bias_op, with_bias): ''' fuse the batch_norm_op' parameters to current_op (conv or fc) - + :param current_op: current operator (conv or fc) :type current_op: Operator :param bn_op: batch norm operator :type bn_op: Operator :param bias_op: elementwise_add operator for adding bias :type bias_op: Operator - :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. + :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. :type with_bias: Int ''' diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 9f675bed895223e054cd3bb6e504fe1607f19858..2b959c48e4bc62e08f6f57981b61b7c5fe3a1d06 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -112,7 +112,7 @@ def fetch(): paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) - paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5) def convert(path):
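# Editorial sketch (not part of this patch): how the InferenceTranspiler above would
# typically be applied before running inference. The model directory name and the
# FLAGS_use_mkldnn environment variable value are illustrative assumptions; only the
# clone-before-transpile pattern recommended in the docstring is taken from this change.
import os
import paddle.fluid as fluid

os.environ["FLAGS_use_mkldnn"] = "1"      # enables the fuse_relu_mkldnn pass described above
place = fluid.CPUPlace()
exe = fluid.Executor(place)

# load a previously saved inference model (directory name is hypothetical)
[inference_program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    "my_model_dir", exe)

# transpile a clone so the original program is left untouched, as the docstring advises
optimized_program = inference_program.clone()
t = fluid.InferenceTranspiler()
t.transpile(optimized_program, place)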