Unverified commit 59992013, authored by yuyang18

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/refactor_op

......@@ -122,5 +122,9 @@ def parse_args():
type=str,
default="",
help='Directory that contains all the training recordio files.')
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
help='If set, uses inference transpiler to optimize the program.')
args = parser.parse_args()
return args
......@@ -131,6 +131,11 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe = fluid.Executor(place)
exe.run(startup_prog)
# Use inference_transpiler to speedup
if args.use_inference_transpiler:
t = fluid.InferenceTranspiler()
t.transpile(infer_prog, place)
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
......
......@@ -26,13 +26,15 @@ function(fetch_include_recursively root_dir)
endforeach()
endfunction()
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
endif()
if (WITH_ANAKIN)
message(STATUS "Anakin for inference is enabled")
......
......@@ -149,21 +149,33 @@ copy(memory_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
)
set(module "inference")
copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
set(inference_deps paddle_fluid_shared paddle_fluid)
if(WITH_CONTRIB)
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
copy(contrib_inference_lib DEPS paddle_inference_api
message(STATUS "installing contrib")
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
if (WITH_ANAKIN)
copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
list(APPEND inference_deps contrib_anakin_inference_lib)
endif()
copy(contrib_inference_lib DEPS paddle_inference_api
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
DSTS ${contrib_dst_dir} ${contrib_dst_dir}
)
DSTS ${contrib_dst_dir} ${contrib_dst_dir})
list(APPEND inference_deps contrib_inference_lib)
endif()
set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
......
=========
About Us
=========
What is PaddlePaddle
--------------------
- PaddlePaddle is a deep learning framework independently developed and open-sourced by Baidu. It lets developers and enterprises realize their AI ideas safely and quickly.
- The project team brings together top deep learning scientists from around the world and is committed to giving developers and enterprises the best deep learning development experience.
- The framework is easy to learn, easy to use, safe, and efficient, making it a deep learning tool well suited to Chinese developers and enterprises.
Technical highlights of PaddlePaddle
------------------------------------
- A new-generation deep learning framework: PaddlePaddle is built on the idea of a "deep learning programming language". While preserving performance, it greatly improves the framework's ability to express models and can describe any model that may potentially arise.
- Friendly to large-scale computation: hardened by many large-scale workloads inside Baidu, PaddlePaddle performs well in distributed computing; its EDL technology saves a large amount of computing resources, and it also supports training large-scale sparse models.
- Visualized deep learning: with Visual DL, developers can conveniently observe the overall training trend, the quality of data samples and intermediate results, the distribution and evolution of parameters, and the structure of the model, which makes development easier.
An education ecosystem built on PaddlePaddle
--------------------------------------------
- Deep learning courses: Baidu has worked with leading education and training institutions in China to develop high-quality deep learning courses and teaching materials that help developers master deep learning from scratch.
- Deep learning practice: for users focused on research and learning, PaddlePaddle provides an installation-free, online development environment along with algorithm, compute, and data support.
- Offline training: rich, high-quality offline activities such as young-teacher training, hands-on camps, salons, and other forms of training and exchange.
AI services based on PaddlePaddle
---------------------------------
- EasyDL: helps enterprises with no algorithm background quickly complete a deep learning task and obtain a high-quality model with only a small amount of data.
- AI marketplace: provides standardized trading mechanisms for AI capabilities and products, helping enterprises quickly find what they need and run their AI business effectively.
- Deep learning competitions: PaddlePaddle gathers top deep learning developers; enterprises can publish their business problems and quickly find the best solutions through competitions.
How to reach us with any question about PaddlePaddle
-----------------------------------------------------
- Learning/usage questions: give us feedback in the `PaddlePaddle open-source community <https://github.com/PaddlePaddle/Paddle/issues>`_ or the `PaddlePaddle Chinese community <http://ai.baidu.com/forum/topic/list/168>`_.
- Suggestions on the development of the PaddlePaddle framework: email Paddle-better@baidu.com.
We look forward to building a world-class deep learning framework with you and advancing AI technology together.
The PaddlePaddle Team
......@@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows:
## Slicing of LoD Tensors
When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
For example, the <2>-slice of the above example is
......@@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is
10 12
||
```
## Length Representation vs Offset Representation
The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API.
Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
```Python
# length representation of lod called recursive_sequence_lengths
recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
# Create a LoDTensor that has the above recursive_sequence_lengths info.
# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
tensor = fluid.LoDTensor(recursive_seq_lens)
# Set/Change the recursive_sequence_lengths info of LoDTensor
tensor.set_recursive_sequence_lengths([[3, 1, 2]])
# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted
# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
new_recursive_seq_lens = tensor.recursive_sequence_lengths()
```
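As a rough sketch (not the actual implementation) of the conversion performed under the hood on the C++ side, each level of `recursive_sequence_lengths` becomes a list of cumulative offsets:
```C++
// Minimal sketch: convert one LoD level from the length representation
// (e.g. {3, 1, 2}) to the offset representation (e.g. {0, 3, 4, 6}).
#include <cstddef>
#include <vector>

std::vector<size_t> LengthsToOffsets(const std::vector<size_t>& lengths) {
  std::vector<size_t> offsets(1, 0);  // offsets always start at 0
  for (size_t len : lengths) {
    offsets.push_back(offsets.back() + len);
  }
  return offsets;
}
```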
# Python Data Feeding
In the former implementation of Paddle Fluid, there were two ways to feed data:
- Use `reader_op` on the backend C++ side. This method only supports feeding data from recordio files and random data generators, but it supports many kinds of `decorated_readers`. For example, `double_buffer_reader` uses two threads to achieve better performance: one for the time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details.
- Feed data directly using `DataFeeder.feed()` in Python code. It is more flexible than the first way: many kinds of preprocessing steps can be performed before feeding using Python or any other language, instead of adding many uncommon `operators` on the C++ side. But this method is less efficient: the program cannot read the next mini-batch before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance.
In this document, we design a Python data feeding process that combines the efficiency of the first way with the flexibility of the second. A data queue `LoDTensorBlockingQueue` is shared by the Python and C++ sides: a `LoDTensorArray` is pushed into the queue on the Python side, while `reader_op` on the C++ side reads the data out of the queue.
## Design of LoDTensorBlockingQueue
`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector<framework::LoDTensor>` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`.
```C++
class LoDTensorBlockingQueueHolder;
class LoDTensorBlockingQueue {
friend class LoDTensorBlockingQueueHolder;
private:
// `LoDTensorBlockingQueue` can only be constructed by
// `LoDTensorBlockingQueueHolder::InitOnce()`
LoDTensorBlockingQueue(size_t capacity, const std::vector<framework::DDim>& dims);
public:
size_t Size() const { return queue_.Size(); } // Get the current size of the queue
size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue
void Close() { return queue_.Close(); }
bool IsClosed() const { return queue_.IsClosed(); }
// Block if Size() == Cap()
// Return false only when queue_.IsClosed() == true
bool Push(const std::vector<framework::LoDTensor> &lod_tensor_vec);
// Block if Size() == 0.
// *Success == false when queue_.IsClosed() == true
std::vector<framework::LoDTensor> Pop(bool *success = nullptr);
private:
// Use reader::BlockingQueue as the inner data structure
BlockingQueue<std::vector<framework::LoDTensor>> queue_;
std::vector<framework::DDim> dims_;
};
class LoDTensorBlockingQueueHolder {
public:
// Call the constructor of `LoDTensorBlockingQueue` to create queue_
// `InitOnce` can only be called once; otherwise an exception will be raised
void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
PADDLE_ENFORCE(queue_ == nullptr);
queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
}
const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { return queue_; }
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
```
There are some major points that must be taken care of:
- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data.
- A `Variable` of `LoDTensorBlockingQueueHolder` (not merely a `VarDesc`) must be created in Python code before `Executor::Run()`, so that `Executor::Run()` can get the fed data when it is called.
- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input.
## Release of the GIL in pybind
`pybind11::gil_scoped_release` is used to release the GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` is invoked from the Python side, so that `LoDTensorBlockingQueue::Push()` and `Executor::Run()` can run in parallel.
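The sketch below is a minimal pybind11 illustration of this pattern (hypothetical names, not the actual Paddle binding code): the GIL is dropped inside the bound method so that a blocking `Push()` does not stall other Python threads, such as the one driving `Executor::Run()`.
```C++
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

namespace py = pybind11;

// Hypothetical stand-in for LoDTensorBlockingQueue.
struct BlockingQueueLike {
  // A real queue would block here while Size() == Cap().
  bool Push(const std::vector<float>& v) { return !v.empty(); }
};

PYBIND11_MODULE(queue_demo, m) {
  py::class_<BlockingQueueLike>(m, "BlockingQueueLike")
      .def(py::init<>())
      .def("push", [](BlockingQueueLike& self, const std::vector<float>& v) {
        py::gil_scoped_release release;  // drop the GIL while we may block
        return self.Push(v);
      });
}
```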
## Design of PyReader
`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
```C++
class PyReader : public ReaderBase {
public:
explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
void ReadNext(std::vector<framework::LoDTensor>* out) override {
bool success;
*out = queue_->Pop(&success);
if (!success) out->clear();
}
void ReInit() override { return; }
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
```
## Design of CreatePyReaderOp
`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable.
```C++
class CreatePyReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) return;
const std::string& queue_name = Input("blocking_queue");
auto* queue_holder_var = scope.FindVar(queue_name);
PADDLE_ENFORCE(queue_holder_var != nullptr);
auto* queue_holder = queue_holder_var
->template GetMutable<framework::LoDTensorBlockingQueueHolder>();
out->Reset(new PyReader(queue_holder->GetQueue()));
}
};
```
## Design of the Python code
The design of the Python code is as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and initialize it with the given parameters, obtaining the `LoDTensorBlockingQueue` object after initialization. After that, a `CreatePyReaderOp` layer is constructed that accepts the name of the `LoDTensorBlockingQueueHolder` variable as input. Both the `LoDTensorBlockingQueue` object and the output of the layer are returned.
```Python
def py_reader(capacity, shapes):
queue_name = unique_name.generate("lod_tensor_blocking_queue")
var = global_scope().var(queue_name) # create LoDTensorBlockingQueueHolder Variable
feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue
out = create_var()
create_py_reader_op_with_queue_name(
inputs={'blocking_queue': queue_name},
outputs={'Out':[out]})
return out, feed_queue
```
## Heap memory profiling and optimization
# Heap memory profiling and optimization
Any program can be at risk of memory leaks. A **memory leak** usually happens when a program allocates memory on the heap but never frees it. As the program keeps running, the memory it occupies keeps growing, which hurts the program's stability: it may get slower and slower or hit an OOM, and it can even destabilize the machine the program runs on and bring it down.
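For illustration only (this snippet is not from the tutorial), the kind of bug that heap profiling is meant to catch looks like the following: heap memory is allocated but never released, so the footprint grows with every call.
```C++
#include <cstddef>
#include <cstring>

void LeakOnce(std::size_t bytes) {
  char* buffer = new char[bytes];  // allocated on the heap
  std::memset(buffer, 1, bytes);   // used ...
  // Missing `delete[] buffer;` -- the allocation is never freed, so calling
  // this repeatedly makes the process memory grow until it hits OOM.
}
```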
......@@ -20,11 +20,11 @@ Paddle also provides a gperftools-based [CPU profiling tutorial](https://github.com/P
Heap memory analysis mainly relies on thread-caching malloc and heap profiling using tcmalloc.
## Workflow
#### Environment
## Environment
This tutorial uses the Docker development image paddlepaddle/paddle:latest-dev provided by Paddle, based on Ubuntu 16.04.4 LTS.
#### Workflow
## Workflow
- Install google-perftools
......
# How to use the timeline tool for performance profiling
1. Wrap the main training loop with `with profiler.profiler(...)`. After running, the code will generate a profile record file at `/tmp/profile`.
**Tip:**
Do not run too many iterations while recording the timeline, since the number of records in the timeline is proportional to the number of iterations.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default this script generates a `/tmp/timeline` file; you can change this path via command-line arguments, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
1. Open the Chrome browser, visit <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
1. The result looks like the figure below; you can zoom in to inspect the details of the timeline.
![chrome timeline](./timeline.jpeg)
......@@ -19,6 +19,9 @@ endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api)
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
endif()
function(inference_api_test TARGET_NAME)
if (WITH_TESTING)
......@@ -50,17 +53,30 @@ cc_test(test_paddle_inference_api
inference_api_test(test_paddle_inference_api_impl
ARGS test_word2vec test_image_classification)
if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS paddle_inference_api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api)
inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec)
endif()
if (WITH_ANAKIN) # only needed in CI
# Anakin does not have official library releases and its protobuf and cuda versions do not match Paddle's,
# so the anakin library will not be merged into our official inference library. To use the anakin prediction API,
# one needs to compile libinference_anakin_api.a and link against anakin.so.
nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
if (WITH_TESTING)
cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif()
if(WITH_TESTING)
......
......@@ -14,3 +14,48 @@
#
inference_api_test(simple_on_word2vec ARGS test_word2vec)
option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
if(NOT WITH_INFERENCE_DEMO)
return()
endif()
set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
function(inference_download_test_demo TARGET)
if (NOT WITH_TESTING)
return()
endif()
set(options "")
set(oneValueArgs URL)
set(multiValueArgs SRCS)
cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}")
message(STATUS "inference demo ${test_dir}")
if(NOT EXISTS "${test_dir}")
message(STATUS "Download ${TARGET} model from ${tests_URL}")
execute_process(COMMAND bash -c "mkdir -p ${test_dir}")
execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}")
execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz")
endif()
cc_test(${TARGET} SRCS "${tests_SRCS}"
DEPS paddle_inference_api paddle_fluid
ARGS --data=${test_dir}/data.txt
--modeldir=${test_dir}/model
--refer=${test_dir}/result.txt)
endfunction()
# disable mobilenet test
#inference_download_test_demo(mobilenet_inference_demo
# SRCS vis_demo.cc
# URL ${URL_ROOT}mobilenet.tar.gz)
inference_download_test_demo(se_resnext50_inference_demo
SRCS vis_demo.cc
URL ${URL_ROOT}se_resnext50.tar.gz)
inference_download_test_demo(ocr_inference_demo
SRCS vis_demo.cc
URL ${URL_ROOT}ocr.tar.gz)
# Inference Demos
Input data format:
- Each line contains a single record
- Each record's format is
```
<space separated floats as data>\t<space separated ints as shape>
```
Follow the C++ code in `vis_demo.cc`.
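As a rough illustration of the format above (a hypothetical helper, not the code from `vis_demo.cc`), one record line can be parsed like this:
```C++
#include <sstream>
#include <string>
#include <vector>

struct Record {
  std::vector<float> data;
  std::vector<int> shape;
};

// Parse "<space separated floats>\t<space separated ints as shape>".
Record ParseRecord(const std::string& line) {
  Record record;
  const size_t tab = line.find('\t');
  std::istringstream data_ss(line.substr(0, tab));
  std::istringstream shape_ss(line.substr(tab + 1));
  for (float f; data_ss >> f;) record.data.push_back(f);
  for (int s; shape_ss >> s;) record.shape.push_back(s);
  return record;
}
```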
## MobileNet
To execute the demo, simply run
```sh
./mobilenet_inference_demo --modeldir <model> --data <datafile>
```
## SE-ResNeXt-50
To execute the demo, simply run
```sh
./se_resnext50_inference_demo --modeldir <model> --data <datafile>
```
## OCR
To execute the demo, simply run
```sh
./ocr_inference_demo --modeldir <model> --data <datafile>
```
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <memory>
#include <thread>
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
namespace demo {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
namespace demo {
static void split(const std::string& str,
char sep,
std::vector<std::string>* pieces) {
pieces->clear();
if (str.empty()) {
return;
}
size_t pos = 0;
size_t next = str.find(sep, pos);
while (next != std::string::npos) {
pieces->push_back(str.substr(pos, next - pos));
pos = next + 1;
next = str.find(sep, pos);
}
if (!str.substr(pos).empty()) {
pieces->push_back(str.substr(pos));
}
}
/*
* Get a summary of a PaddleTensor content.
*/
static std::string SummaryTensor(const PaddleTensor& tensor) {
std::stringstream ss;
int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype);
ss << "data[:10]\t";
switch (tensor.dtype) {
case PaddleDType::INT64: {
for (int i = 0; i < std::min(num_elems, 10); i++) {
ss << static_cast<int64_t*>(tensor.data.data())[i] << " ";
}
break;
}
case PaddleDType::FLOAT32:
for (int i = 0; i < std::min(num_elems, 10); i++) {
ss << static_cast<float*>(tensor.data.data())[i] << " ";
}
break;
}
return ss.str();
}
} // namespace demo
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains demo for mobilenet, se-resnext50 and ocr.
*/
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/contrib/inference/demo/utils.h"
#include "paddle/contrib/inference/paddle_inference_api.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
#endif
namespace paddle {
namespace demo {
DEFINE_string(modeldir, "", "Directory of the inference model.");
DEFINE_string(refer, "", "path to reference result for comparison.");
DEFINE_string(
data,
"",
"path of data; each line is a record, format is "
"'<space splitted floats as data>\t<space splitted ints as shape'");
struct Record {
std::vector<float> data;
std::vector<int32_t> shape;
};
void split(const std::string& str, char sep, std::vector<std::string>* pieces);
Record ProcessALine(const std::string& line) {
LOG(INFO) << "process a line";
std::vector<std::string> columns;
split(line, '\t', &columns);
CHECK_EQ(columns.size(), 2UL)
<< "data format error, should be <data>\t<shape>";
Record record;
std::vector<std::string> data_strs;
split(columns[0], ' ', &data_strs);
for (auto& d : data_strs) {
record.data.push_back(std::stof(d));
}
std::vector<std::string> shape_strs;
split(columns[1], ' ', &shape_strs);
for (auto& s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
LOG(INFO) << "data size " << record.data.size();
LOG(INFO) << "data shape size " << record.shape.size();
return record;
}
void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
std::string line;
std::ifstream file(referfile);
std::getline(file, line);
auto refer = ProcessALine(line);
file.close();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
LOG(INFO) << "predictor output numel " << numel;
LOG(INFO) << "reference output numel " << refer.data.size();
EXPECT_EQ(numel, refer.data.size());
switch (output.dtype) {
case PaddleDType::INT64: {
for (size_t i = 0; i < numel; ++i) {
EXPECT_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
}
break;
}
case PaddleDType::FLOAT32:
for (size_t i = 0; i < numel; ++i) {
EXPECT_NEAR(
static_cast<float*>(output.data.data())[i], refer.data[i], 1e-5);
}
break;
}
}
/*
* Use the native fluid engine to run inference for the demo.
*/
void Main(bool use_gpu) {
NativeConfig config;
config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__";
config.use_gpu = use_gpu;
config.device = 0;
#ifdef PADDLE_WITH_CUDA
config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use;
#endif
LOG(INFO) << "init predictor";
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
LOG(INFO) << "begin to process data";
// Just a single batch of data.
std::string line;
std::ifstream file(FLAGS_data);
std::getline(file, line);
auto record = ProcessALine(line);
file.close();
// Inference.
PaddleTensor input{
.name = "xx",
.shape = record.shape,
.data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
.dtype = PaddleDType::FLOAT32};
LOG(INFO) << "run executor";
std::vector<PaddleTensor> output;
predictor->Run({input}, &output);
LOG(INFO) << "output.size " << output.size();
auto& tensor = output.front();
LOG(INFO) << "output: " << SummaryTensor(tensor);
// compare with reference result
CheckOutput(FLAGS_refer, tensor);
}
TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); }
#ifdef PADDLE_WITH_CUDA
TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); }
#endif
} // namespace demo
} // namespace paddle
# Paddle Inference API
To make inference deployment simpler and more convenient, Fluid provides a set of high-level APIs that hide the differences between the underlying optimized implementations.
The inference library contains:
- the header `paddle_inference_api.h`, which defines all the interfaces
- the library files `libpaddle_fluid.so` and `libpaddle_fluid.a`
- the library files `libpaddle_inference_api.so` and `libpaddle_inference_api.a`
The main API concepts are introduced in detail below.
## PaddleTensor
`PaddleTensor` defines the most basic input/output data format for inference. Its definition is
```c++
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
```
- `name` specifies the name of the corresponding variable in the model (not used yet; it will be enabled once arbitrary targets are supported)
- `shape` is the shape of the tensor
- `data` holds the data in contiguous memory inside a `PaddleBuf`; a `PaddleBuf` can wrap externally provided data or `malloc` its own memory, see the related definitions in the header for details
- `dtype` is the data type of the tensor (a minimal sketch of reading values back out of an output tensor follows this list)
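As a rough sketch (assuming `outputs` has been filled by `PaddlePredictor::Run()` as in the full example further below), values are read back by interpreting `data` according to `dtype`:
```c++
// Minimal sketch: read FLOAT32 values out of an output PaddleTensor.
const paddle::PaddleTensor& output = outputs.front();
size_t num_elements = output.data.length() / sizeof(float);
const float* values = static_cast<float*>(output.data.data());
for (size_t i = 0; i < num_elements; ++i) {
  // values[i] is the i-th element of the flattened result.
}
```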
## engine
The high-level API is backed by several optimized implementations, which we call engines. There are currently three engines:
- the native engine, composed of Paddle's native forward operators, which naturally supports every model trained with Paddle,
- the Anakin engine, which wraps [Anakin](https://github.com/PaddlePaddle/Anakin); it performs well on some models, but only accepts Anakin's own model format and therefore cannot support all Paddle models,
- the TensorRT mixed engine, which integrates [TensorRT](https://developer.nvidia.com/tensorrt) at the subgraph level; it supports all Paddle models and automatically offloads parts of the computation graph to TensorRT for acceleration (WIP)
They are enumerated as
```c++
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
};
```
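The engine is selected at compile time through the template parameters of the `CreatePaddlePredictor` factory used below. A minimal sketch follows; the paths and the `AnakinConfig` field name are assumptions, and the Anakin branch requires building with `WITH_ANAKIN`:
```c++
// Native engine: config type NativeConfig paired with kNative.
paddle::NativeConfig native_config;
native_config.model_dir = "path/to/fluid_model";  // assumed path
auto native_predictor =
    paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                  paddle::PaddleEngineKind::kNative>(native_config);

// Anakin engine: config type AnakinConfig paired with kAnakin.
paddle::AnakinConfig anakin_config;
anakin_config.model_file = "path/to/model.anakin.bin";  // assumed field/path
anakin_config.max_batch_size = 1;
auto anakin_predictor =
    paddle::CreatePaddlePredictor<paddle::AnakinConfig,
                                  paddle::PaddleEngineKind::kAnakin>(anakin_config);
```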
## Inference deployment workflow
Overall, it takes the following steps:
1. Create a `PaddlePredictor` with a suitable config
2. Create the input `PaddleTensor`s and pass them to the `PaddlePredictor`
3. Fetch the output `PaddleTensor`s and read out the results
A complete walkthrough of a simple model follows, with some detail code elided
```c++
#include "paddle_inference_api.h"
// Create a config and adjust the relevant settings
paddle::NativeConfig config;
config.model_dir = "xxx";
config.use_gpu = false;
// Create a native PaddlePredictor
auto predictor =
paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
// Create the input tensor
int64_t data[4] = {1, 2, 3, 4};
paddle::PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
// Create the output tensors; their memory can be reused
std::vector<paddle::PaddleTensor> outputs;
// Run inference
CHECK(predictor->Run({tensor}, &outputs));
// Fetch the outputs ...
```
When compiling, simply link against `libpaddle_fluid.a/.so` and `libpaddle_inference_api.a/.so`.
## More code references
- [inference demos](./demo)
- [More complex single-thread / multi-thread examples](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc)
......@@ -16,6 +16,19 @@ limitations under the License. */
namespace paddle {
int PaddleDtypeSize(PaddleDType dtype) {
switch (dtype) {
case PaddleDType::FLOAT32:
return sizeof(float);
case PaddleDType::INT64:
return sizeof(int64_t);
default:
//
assert(false);
return -1;
}
}
PaddleBuf::PaddleBuf(PaddleBuf&& other)
: data_(other.data_),
length_(other.length_),
......@@ -62,4 +75,4 @@ void PaddleBuf::Free() {
}
}
} // namespace paddle
\ No newline at end of file
} // namespace paddle
......@@ -15,7 +15,7 @@ limitations under the License. */
/*
* This file contains the definition of a simple Inference API for Paddle.
*
* ATTENTION: It requires some C++ features, for lower version C++ or C, we
* ATTENTION: It requires some C++11 features, for lower version C++ or C, we
* might release another API.
*/
......@@ -73,12 +73,12 @@ struct PaddleTensor {
};
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
// TODO(Superjomn) support the following engines later.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
/*
......@@ -130,6 +130,11 @@ struct AnakinConfig : public PaddlePredictor::Config {
int max_batch_size{-1};
};
struct TensorRTConfig : public NativeConfig {
// Determine whether a subgraph will be executed by TRT.
int min_subgraph_size{1};
};
// A factory to help create different predictors.
//
// FOR EXTENSION DEVELOPER:
......@@ -140,4 +145,7 @@ struct AnakinConfig : public PaddlePredictor::Config {
// Similarly, each engine kind should map to a unique predictor implementation.
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
int PaddleDtypeSize(PaddleDType dtype);
} // namespace paddle
......@@ -89,6 +89,7 @@ bool NativePaddlePredictor::Init(
LOG(ERROR) << "fail to load inference model.";
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
executor_->CreateVariables(
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
......@@ -119,6 +120,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
VLOG(4) << "setting " << i << "-th target";
feed_targets[feed_target_names_[i]] = &feeds[i];
}
// get fetch variable
......@@ -130,14 +132,16 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
}
// Run the inference program
// if share variables, we need not create variables
VLOG(4) << "Run prepared context";
executor_->RunPreparedContext(
ctx_.get(),
sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
&feed_targets,
&fetch_targets,
false /* don't create variable each time */);
VLOG(4) << "Finish prepared context";
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
LOG(ERROR) << "fail to get fetches";
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
......
......@@ -44,7 +44,7 @@ class NativePaddlePredictor : public PaddlePredictor {
~NativePaddlePredictor() override;
private:
protected:
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
using inference::analysis::Argument;
using inference::Singleton;
using inference::analysis::Analyzer;
using framework::proto::ProgramDesc;
class TensorRTSubgraphPredictor : public NativePaddlePredictor {
public:
explicit TensorRTSubgraphPredictor(const TensorRTConfig& config)
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
VLOG(3) << "Predictor::init()";
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
} else {
paddle::framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
}
executor_.reset(new paddle::framework::Executor(place_));
// Initialize the inference program
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
return false;
}
// Analyze inference_program
Argument argument;
argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
CHECK(argument.transformed_program_desc);
VLOG(5) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(5) << "to prepare executor";
*inference_program_->Proto() = *argument.transformed_program_desc;
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
executor_->CreateVariables(
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
private:
TensorRTConfig config_;
};
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
const TensorRTConfig& config) {
VLOG(3) << "create TensorRTSubgraphPredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory,
0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
}
std::unique_ptr<PaddlePredictor> predictor(
new TensorRTSubgraphPredictor(config));
if (!dynamic_cast<TensorRTSubgraphPredictor*>(predictor.get())
->Init(nullptr)) {
return nullptr;
}
return std::move(predictor);
}
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) {
//# 1. Create PaddlePredictor with a config.
TensorRTConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
auto predictor =
CreatePaddlePredictor<TensorRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config);
for (int batch_id = 0; batch_id < 3; batch_id++) {
//# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor);
//# 3. Run
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
}
}
}
TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
} // namespace paddle
\ No newline at end of file
......@@ -486,6 +486,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
}
} else if (op.Type() == "concat") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else {
PADDLE_ENFORCE(
"the distribute training related op should be in [split_byref, "
......
......@@ -30,7 +30,7 @@ class SSAGraphBuilder {
SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
virtual int GetVarDeviceID(const std::string &var_name) const { return -1; }
virtual int GetVarDeviceID(const std::string &var_name) const = 0;
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
......
......@@ -16,6 +16,8 @@
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include <string>
namespace paddle {
namespace framework {
namespace details {
......@@ -33,6 +35,10 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
return graph;
}
int GetVarDeviceID(const std::string& var_name) const override {
return builder_->GetVarDeviceID(var_name);
}
bool IsValidGraph(const SSAGraph* graph) const;
private:
......
......@@ -15,6 +15,7 @@
#pragma once
#include <iosfwd>
#include <string>
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle {
......@@ -55,6 +56,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
return graph;
}
int GetVarDeviceID(const std::string& var_name) const override {
return builder_->GetVarDeviceID(var_name);
}
private:
std::unique_ptr<SSAGraphPrinter> printer_;
std::unique_ptr<SSAGraphBuilder> builder_;
......
......@@ -748,6 +748,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &var->Get<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value());
} else if (var->IsType<LoDTensorArray>()) {
const LoDTensorArray& arr = var->Get<LoDTensorArray>();
PADDLE_ENFORCE(arr.size() > 0);
t = &(arr[0]);
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
......
......@@ -133,17 +133,18 @@ ParallelExecutor::ParallelExecutor(
void ParallelExecutor::BCastParamsToGPUs(
const std::unordered_set<std::string> &vars) const {
// the the initialize bcast, all vars would be bcast from device(0), otherwise
// the the initializing bcast, all vars would be bcast from device(0),
// otherwise
// bcast from the specified device.
bool initialize = builder_.get() == nullptr ? true : false;
bool initializing = builder_.get() == nullptr ? true : false;
for (auto &var : vars) {
int var_dev_id =
builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
if (!initialize && var_dev_id == -1) continue;
if (!initializing && var_dev_id == -1) continue;
framework::Variable *main_var = nullptr;
if (initialize) {
if (initializing) {
main_var = member_->local_scopes_[0]->FindVar(var);
} else {
main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
......@@ -164,7 +165,8 @@ void ParallelExecutor::BCastParamsToGPUs(
auto place = member_->places_[i];
void *buffer;
if ((initialize && i == 0) || (!initialize && i == var_dev_id)) {
if ((initializing && i == 0) ||
(!initializing && static_cast<int>(i) == var_dev_id)) {
buffer = const_cast<void *>(main_tensor.data<void>());
} else {
auto local_scope = member_->local_scopes_[i];
......@@ -181,8 +183,16 @@ void ParallelExecutor::BCastParamsToGPUs(
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
if (initializing) {
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
} else {
if (var_dev_id >= 0) {
platform::dynload::ncclBcast(buffers[i], numel, data_type,
var_dev_id, nccl_ctx.comm_,
nccl_ctx.stream());
}
}
}
member_->nccl_ctxs_->WaitAll();
}
......@@ -243,6 +253,9 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
t->set_lod(lod_tensors[j].lod());
}
}
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
}
ParallelExecutor::~ParallelExecutor() {
......
......@@ -28,9 +28,10 @@ endif()
if(WITH_TESTING)
# both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book)
add_subdirectory(analysis)
endif()
add_subdirectory(analysis)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
fluid_to_data_flow_graph_pass.cc
data_flow_graph_to_fluid_pass.cc
tensorrt_subgraph_pass.cc
dfg_graphviz_draw_pass.cc
DEPS framework_proto)
tensorrt_subgraph_pass.cc
tensorrt_subgraph_node_mark_pass.cc
analyzer.cc
helper.cc
DEPS framework_proto proto_desc)
cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
function (inference_analysis_test TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_test(${TARGET}
SRCS "${analysis_test_SRCS}"
DEPS analysis
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
cc_test(${TARGET}
SRCS "${analysis_test_SRCS}"
DEPS analysis
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
endfunction(inference_analysis_test)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
......@@ -28,5 +32,7 @@ inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
"Enable subgraph to TensorRT engine for acceleration");
DEFINE_string(inference_analysis_graphviz_log_root, "./",
"Graphviz debuger for data flow graphs.");
class DfgPassManagerImpl final : public DfgPassManager {
public:
DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs.
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [](const Node* node) {
if (!node->IsFunction()) return false;
return static_cast<const Function*>(node)->func_type() == "mul";
};
AddPass("tensorrt-subgraph-marker",
new TensorRTSubgraphNodeMarkPass(trt_teller));
AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
}
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
}
std::string repr() const override { return "dfg-pass-manager"; }
std::string description() const override { return "DFG pass manager."; }
private:
void AddPass(const std::string& name, Pass* pass) {
LOG(INFO) << "Adding pass " << name;
Register(name, pass);
AddGraphvizDebugerPass(pass);
}
// Add the graphviz debuger pass if the parent pass has one.
void AddGraphvizDebugerPass(Pass* pass) {
auto* debuger_pass = pass->CreateGraphvizDebugerPass();
if (debuger_pass) {
LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]";
Register(debuger_pass->repr(), debuger_pass);
}
}
};
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) {
for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument));
x->RunAll();
PADDLE_ENFORCE(x->Finalize());
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
\ No newline at end of file
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains Analyzer, a class exposed as a library that analyzes and
* optimizes a Fluid ProgramDesc for inference. Similar to LLVM, it has
* multiple flags to control whether a process is applied to the program.
*
* The processes are called Passes in analysis. The Passes are placed in a
* pipeline: the first Pass is FluidToDataFlowGraphPass, which transforms a
* Fluid ProgramDesc into a data flow graph; the last Pass is
* DataFlowGraphToFluidPass, which transforms a data flow graph back into a
* Fluid ProgramDesc. The Passes in the middle of the pipeline can be any
* Passes that take a node or a data flow graph as input.
*
* The Analyzer can be used in two ways: the first is an executable that
* pre-processes the inference model and can be controlled by passing different
* command flags; the other is to compose it inside the inference API as a
* runtime pre-processing phase of the inference service.
*/
#include <gflags/gflags.h>
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
namespace paddle {
namespace inference {
namespace analysis {
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
DECLARE_string(inference_analysis_graphviz_log_root);
class Analyzer : public OrderedRegistry<PassManager> {
public:
// Register all the pass-managers.
Analyzer();
void Run(Argument* argument);
DISABLE_COPY_AND_ASSIGN(Analyzer);
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST_F(DFG_Tester, main) {
Analyzer analyser;
analyser.Run(&argument);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -41,6 +41,9 @@ struct Argument {
// The original program desc.
std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc;
// The processed program desc.
std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
};
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
......
......@@ -20,7 +20,7 @@ namespace paddle {
namespace inference {
namespace analysis {
// It is a better idea that the inputs and outputs of this graph is set manully
// It is a better idea that the inputs and outputs of this graph is set manually
// before, but there must be a Pass that helps to prune the unnecessary ops that
// do not contribute to the given targets, so in this pass, analysis and get the
// inputs and outputs is OK.
......@@ -50,6 +50,25 @@ void DataFlowGraph::Build() {
outputs.push_back(out);
}
}
Clean();
}
void DataFlowGraph::Clean() {
for (auto &node : nodes.nodes()) {
std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
node->inlinks.end());
std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
node->outlinks.end());
if (inlinks_set.size() < node->inlinks.size()) {
LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
}
if (outlinks_set.size() < node->outlinks.size()) {
LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
}
}
}
std::string DataFlowGraph::DotString() const {
......
......@@ -47,6 +47,10 @@ struct DataFlowGraph {
// Output a DOT graph file for debug.
std::string DotString() const;
private:
// Remove duplicate edges and so on.
void Clean();
};
/*
......@@ -133,17 +137,24 @@ struct GraphTraits<DataFlowGraph> {
// Extract the inputs and outputs of a graph. The inputs and outputs of a
// sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph.
std::pair<
std::vector<Node *>,
std::vector<
Node *>> static ExtractInputAndOutputOfSubGraph(std::vector<Node *>
&graph) {
static std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {
std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs;
// Input a Value, check whether its inlink is in the subgraph.
auto inlink_in_subgraph = [&](Node *n) {
for (auto *in : n->inlinks) {
if (nodes.count(in)) return true;
}
return false;
};
for (auto &node : graph) {
for (auto *in : node->inlinks) {
if (!nodes.count(in) && in->type() == Node::Type::kValue) {
// The Value that is written by nodes inside a sub-graph shouldn't be the
// input of the sub-graph.
if (!nodes.count(in) && in->type() == Node::Type::kValue &&
!inlink_in_subgraph(in)) {
inputs.insert(in);
}
}
......
......@@ -13,21 +13,34 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::proto::ProgramDesc;
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes);
bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
desc_ = argument->origin_program_desc.get();
// Here some logic from program_desc.cc and will not add new interfaces into
// framework::ProgramDesc class, use some UT to assure the correctness.
auto* block = desc_->mutable_blocks()->Add();
block->set_idx(framework::kRootBlockIndex);
block->set_parent_idx(framework::kNoneBlockIndex);
PADDLE_ENFORCE(!argument->transformed_program_desc);
// The transformed_program_desc should inherit all the VarDesc and BlockDesc
// from the original program desc. The operators of the main block(the first
// block) should rewritten by data flow graph.
argument->transformed_program_desc.reset(
new ProgramDesc(*argument->origin_program_desc));
argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex)
->clear_ops();
desc_ = argument->transformed_program_desc.get();
argument_ = argument;
return true;
}
......@@ -37,14 +50,17 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
auto traits = GraphTraits<DataFlowGraph>(graph);
for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
if (it->deleted()) continue;
switch (it->type()) {
case Node::Type::kFunction:
LOG(INFO) << "add function " << it->name();
case Node::Type::kFunction: {
LOG(INFO) << "add function " << it->repr();
AddFluidOp(&(*it));
break;
case Node::Type::kFunctionBlock:
} break;
case Node::Type::kFunctionBlock: {
LOG(INFO) << "add engine op " << it->repr() << " , "
<< static_cast<FunctionBlock*>(&(*it))->subgraph.size();
AddEngineOp(&(*it));
break;
} break;
default:
continue;
}
......@@ -52,12 +68,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
}
void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
LOG(INFO) << "processing func " << node->name();
auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
// currently only the main block is analyzed.
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
LOG(INFO) << "to copy the op";
*op = *ori_op; // copy the attributes, by default, these will not be changed
// by analysis phrase.
// The inputs and outputs of the existing ops are not changed by tensorrt
......@@ -65,11 +79,89 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
// NOTE It might be changed by other passes in the long run.
}
void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
const framework::proto::BlockDesc& block) {
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
auto* func = static_cast<FunctionBlock*>(node);
// collect inputs
std::vector<std::string> io;
for (auto* x : func->inlinks) {
io.push_back(x->name());
}
desc.SetInput("Xs", io);
// collect outputs
io.clear();
for (auto* x : func->outlinks) {
io.push_back(x->name());
}
desc.SetOutput("Ys", io);
desc.SetType("tensorrt_engine");
// Set attrs
SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
SetAttr(desc.Proto(), "engine_unique_key",
"trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "max_workspace",
1024); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
node->SetPbMsg(desc.Proto()->SerializeAsString());
}
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes) {
std::vector<std::string> parameters;
for (const auto& node : nodes) {
if (!node->IsValue()) continue;
PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
framework::proto::VarDesc var;
var.ParseFromString(node->pb_msg());
if (var.persistable()) {
parameters.push_back(var.name());
}
}
return parameters;
}
void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
// auto* ori_op = static_cast<framework::proto::OpDesc*>(node->extra_info());
// auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
// auto* op = main_block->add_ops();
// TODO(Superjomn) Some arguments need to be exposed here for the default setting.
PADDLE_ENFORCE(node->IsFunctionBlock());
auto* block_node = static_cast<FunctionBlock*>(node);
framework::proto::BlockDesc proto;
framework::BlockDesc block_desc(nullptr, &proto);
// copy ops.
for (auto* node : block_node->subgraph) {
auto* op = block_desc.AppendOp();
PADDLE_ENFORCE(!node->pb_msg().empty());
op->Proto()->ParseFromString(node->pb_msg());
}
CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
op->ParseFromString(node->pb_msg());
}
namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
bool Finalize() override { return true; }
};
}
Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger"));
}
} // namespace analysis
......
......@@ -40,10 +40,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
return "Transform a DFG to a Fluid ProgramDesc";
}
Pass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const override {
return nullptr;
}
Pass *CreateGraphvizDebugerPass() const override;
protected:
// Add a Fluid Op into the ProgramDesc.
......@@ -53,6 +50,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
private:
framework::proto::ProgramDesc *desc_;
Argument *argument_;
};
} // namespace analysis
} // namespace inference
......
......@@ -18,12 +18,19 @@ namespace paddle {
namespace inference {
namespace analysis {
int DFG_GraphvizDrawPass::counter_{0};
void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
auto content = Draw(graph);
std::ofstream file(GenDotPath());
auto dot_path = GenDotPath();
std::ofstream file(dot_path);
file.write(content.c_str(), content.size());
file.close();
LOG(INFO) << "draw dot to " << GenDotPath();
auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
std::string message;
LOG(INFO) << "draw to " << png_path;
ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
}
std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
......@@ -41,9 +48,7 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) {
if (!config_.display_deleted_node && in->deleted()) continue;
for (auto &in : node.inlinks) {
dot.AddEdge(in->repr(), node.repr(), {});
}
dot.AddEdge(in->repr(), node.repr(), {});
}
}
return dot.Build();
......
......@@ -50,20 +50,25 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
bool Initialize(Argument *argument) override { return true; }
void Run(DataFlowGraph *graph) override;
bool Finalize() override { return Pass::Finalize(); }
bool Finalize() override { return true; }
std::string repr() const override { return "DFG graphviz drawer"; }
std::string description() const override {
return "Debug a DFG by draw with graphviz";
}
private:
protected:
// A counter to add a number prefix to the debugger image output so that the
// images sort in the order they were triggered.
static int counter_;
// Path of the dot file to output.
std::string GenDotPath() const {
return config_.dir + "/" + "graph_" + config_.id + ".dot";
return config_.dir + "/" + std::to_string(counter_++) + "-graph_" +
config_.id + ".dot";
}
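// For example, with config_.dir = "./" and config_.id = "test", successive
// invocations produce "./0-graph_test.dot", "./1-graph_test.dot", ... so the
// generated images sort in the order the passes were triggered.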
std::string Draw(DataFlowGraph *graph);
virtual std::string Draw(DataFlowGraph *graph);
Config config_;
};
......
......@@ -31,7 +31,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
pass.Run(&dfg);
// test content
std::ifstream file("./graph_test.dot");
std::ifstream file("./0-graph_test.dot");
ASSERT_TRUE(file.is_open());
std::string line;
......@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
no++;
}
// DFG is sensitive to ProgramDesc, be careful to change the existing models.
ASSERT_EQ(no, 112);
ASSERT_EQ(no, 82);
}
} // namespace analysis
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
namespace paddle {
......@@ -33,7 +35,7 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
return true;
}
bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); }
bool FluidToDataFlowGraphPass::Finalize() { return true; }
void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
PADDLE_ENFORCE(graph);
......@@ -46,6 +48,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
auto *v = graph->nodes.Create(Node::Type::kValue);
v->SetName(var.name());
v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
v->SetPbMsg(var.SerializeAsString());
var2id[var.name()] = v->id();
}
for (int i = 0; i < main_block.ops_size(); i++) {
......@@ -56,6 +59,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
// Link to the original protobuf message's memory, making it easier to
// generate a fluid ProgramDesc from the data flow graph.
o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
o->SetPbMsg(op.SerializeAsString());
// set inputs and outputs
// TODO(Superjomn) make sure the InputNames is the real variable name.
for (int j = 0; j < op.inputs_size(); j++) {
......@@ -79,9 +84,19 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
graph->Build();
}
Pass *FluidToDataFlowGraphPass::CreatePrinterPass(
std::ostream &os, const std::string &banner) const {
return nullptr;
namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
bool Finalize() override { return true; }
};
}
Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger"));
}
} // namespace analysis
......
......@@ -46,8 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
return "transform a fluid ProgramDesc to a data flow graph.";
}
Pass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const override;
Pass *CreateGraphvizDebugerPass() const override;
private:
framework::proto::ProgramDesc const *desc_;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/framework/framework.pb.h"
namespace paddle {
namespace inference {
namespace analysis {
template <>
void SetAttr<std::string>(framework::proto::OpDesc *op, const std::string &name,
const std::string &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRING);
attr->set_s(data);
}
template <>
void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
const int &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
const int64_t &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::LONG);
attr->set_l(data);
}
template <>
void SetAttr<std::vector<std::string>>(framework::proto::OpDesc *op,
const std::string &name,
const std::vector<std::string> &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRINGS);
for (const auto &s : data) {
attr->add_strings(s.c_str());
}
}
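// A minimal usage sketch of these helpers (the op type and attribute values are
// illustrative), mirroring how CreateTrtEngineOp fills the engine op's attributes:
//   framework::proto::OpDesc op;
//   op.set_type("tensorrt_engine");
//   SetAttr<std::string>(&op, "subgraph", block.SerializeAsString());
//   SetAttr<int>(&op, "max_batch", 100);
//   SetAttr<std::vector<std::string>>(&op, "parameters", {"fc_w", "fc_b"});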
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -14,10 +14,12 @@ limitations under the License. */
#pragma once
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -26,6 +28,10 @@ namespace paddle {
namespace inference {
namespace analysis {
template <typename T>
void SetAttr(framework::proto::OpDesc *op, const std::string &name,
const T &data);
template <typename Vec>
int AccuDims(Vec &&vec, int size) {
int res = 1;
......@@ -93,7 +99,7 @@ template <typename T>
class OrderedRegistry {
public:
T *Register(const std::string &name, T *x) {
PADDLE_ENFORCE(!dic_.count(name));
PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
dic_[name] = data_.size();
data_.emplace_back(std::unique_ptr<T>(x));
return data_.back().get();
......@@ -117,6 +123,20 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
return *var->GetMutable<T>();
}
static void ExecShellCommand(const std::string &cmd, std::string *message) {
char buffer[128];
std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
if (!pipe) {
LOG(ERROR) << "error running command: " << cmd;
return;
}
while (!feof(pipe.get())) {
if (fgets(buffer, 128, pipe.get()) != nullptr) {
*message += buffer;
}
}
}
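// Usage sketch (file names are illustrative): this helper is what
// DFG_GraphvizDrawPass uses to render the dot file it just wrote, e.g.
//   std::string message;
//   ExecShellCommand("dot -Tpng 0-graph_test.dot -o 0-graph_test.png", &message);
//   if (!message.empty()) LOG(INFO) << "dot output: " << message;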
} // namespace analysis
} // namespace inference
} // namespace paddle
......
......@@ -20,6 +20,17 @@ namespace paddle {
namespace inference {
namespace analysis {
template <>
std::string &NodeAttr::As<std::string>() {
if (data_.empty()) {
type_hash_ = typeid(std::string).hash_code();
}
PADDLE_ENFORCE_EQ(type_hash_, typeid(std::string).hash_code());
return data_;
}
std::string &NodeAttr::String() { return As<std::string>(); }
std::vector<Dot::Attr> Value::dot_attrs() const {
return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
Dot::Attr("shape", "box"),
......
......@@ -35,6 +35,44 @@ namespace analysis {
class NodeMap;
// A helper class to maintain the status from Pass.
struct NodeAttr {
// NOTE T should be a primitive type or a struct composed of several primitive
// types.
// NOTE STL containers should not be used here.
// Sample usage:
//   NodeAttr attr;
//   attr.Bool() = true;
bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); }
std::string &String();
private:
template <typename T>
T &As() {
// init storage in the first usage.
if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T);
type_hash_ = typeid(T).hash_code();
data_.resize(sizeof(T));
}
PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_hash_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
}
private:
std::string data_;
size_t type_hash_{std::numeric_limits<size_t>::max()};
};
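// A short usage sketch (values are illustrative): the first typed access fixes
// the stored type, and later accesses must use the same type or PADDLE_ENFORCE
// fires.
//   NodeAttr attr;
//   attr.Int32() = 8;          // initializes the storage as an int32_t
//   int32_t v = attr.Int32();  // OK, same type
//   // attr.Float();           // type not matched -> enforce failure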
/*
* Node Representation.
*
......@@ -50,8 +88,6 @@ class Node {
Node() = default;
struct Attr;
// Cast to a subclass type, Function for example.
template <typename Subclass>
Subclass &As() {
......@@ -71,7 +107,7 @@ class Node {
// Get an additional attribute and convert it to T data type. NOTE this will
// silently create a new attribute if not exists.
Attr &attr(const std::string &name) const { return attrs_[name]; }
NodeAttr &attr(const std::string &name) const { return attrs_[name]; }
int id() const { return id_; }
......@@ -80,6 +116,9 @@ class Node {
void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; }
void *pb_desc() const { return attr("pb_desc").Pointer(); }
void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; }
const std::string &pb_msg() const { return attr("pb_msg").String(); }
void SetDeleted() { deleted_ = true; }
bool deleted() const { return deleted_; }
......@@ -94,43 +133,6 @@ class Node {
// Output links.
std::vector<Node *> outlinks;
// A helper class to maintain the status from Pass.
struct Attr {
// NOTE T should be a primary type or a struct combined by several primary
// types.
// NOTE the STL containers should not use here.
// Some usages
// Attr attr;
// attr.Bool() = true;
bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); }
private:
template <typename T>
T &As() {
// init storage in the first usage.
if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T);
type_hash_ = typeid(T).hash_code();
data_.resize(sizeof(T));
}
PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_hash_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
}
private:
std::string data_;
size_t type_hash_{std::numeric_limits<size_t>::max()};
};
// Type checks.
bool IsFunction() const { return type_ == Node::Type::kFunction; }
bool IsValue() const { return type_ == Node::Type::kValue; }
......@@ -150,7 +152,7 @@ class Node {
Type type_{Type::kNone};
// Mark this node is deleted by some pass.
bool deleted_{false};
mutable std::unordered_map<std::string, Attr> attrs_;
mutable std::unordered_map<std::string, NodeAttr> attrs_;
};
class Function;
......@@ -213,6 +215,10 @@ class Function : public Node {
struct FunctionBlock : public Node {
std::string repr() const override { return "block-" + std::to_string(id()); }
std::vector<Node *> subgraph;
protected:
FunctionBlock() { SetType(Node::Type::kFunctionBlock); }
friend class NodeMap;
};
class NodeMap {
......@@ -227,7 +233,7 @@ class NodeMap {
void Delete(size_t id);
const std::vector<std::unique_ptr<Node>> &nodes() { return nodes_; }
const std::vector<std::unique_ptr<Node>> &nodes() const { return nodes_; }
size_t size() const { return nodes_.size(); }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file contains all the flags that are declared in Node::Attr.
*
* Node::Attr is designed to share information between different passes; with the
* flags in this file, one pass can read the attributes another pass set on a Node.
*/
#pragma once
namespace paddle {
namespace inference {
namespace analysis {
#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__;
DECLARE_NODE_ATTR(supported_by_tensorrt) // bool
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -60,6 +60,9 @@ class Pass {
return nullptr;
}
// Create a debugger Pass that draws the DFG with the graphviz toolkit.
virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
// Run on a single Node.
virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
// Run on a single Function.
......
......@@ -19,6 +19,18 @@ namespace paddle {
namespace inference {
namespace analysis {
bool PassManager::Initialize(Argument* argument) {
argument_ = argument;
for (auto& pass : data_) {
LOG(INFO) << "Initializing pass " << pass->repr();
if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false;
}
}
return true;
}
void DfgPassManager::RunAll() {
PADDLE_ENFORCE(argument_);
for (auto& pass : data_) {
......
......@@ -50,17 +50,7 @@ class PassManager : public OrderedRegistry<Pass> {
// globally shared, so pass them as the arguments for all the pass managers.
virtual bool Initialize(const Argument& argument) { return false; }
virtual bool Initialize(Argument* argument) {
argument_ = argument;
for (auto& pass : data_) {
LOG(INFO) << "Initializing pass " << pass->repr();
if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false;
}
}
return true;
}
virtual bool Initialize(Argument* argument);
// Call all the passes' Finalize methods.
virtual bool Finalize() {
......
......@@ -64,6 +64,7 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
ASSERT_TRUE(&argument);
ASSERT_TRUE(manager.Initialize(&argument));
manager.RunAll();
}
......
......@@ -119,10 +119,12 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) {
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// Replace this sub-graph with a single node. Three steps: 1. Create a Block
// Node that contains this subgraph. 2. Mark the nodes inside the sub-graph
// as deleted. 3. Replace the deleted nodes with the new Block Node.
auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock);
auto *block_node = static_cast<FunctionBlock *>(
graph_->nodes.Create(Node::Type::kFunctionBlock));
auto io = ExtractInputAndOutputOfSubGraph(subgraph);
block_node->inlinks = std::move(io.first);
block_node->outlinks = std::move(io.second);
......@@ -130,21 +132,25 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
// TODO(Superjomn) need a unified mechanism to treat deleted node in each
// pass.
node->SetDeleted();
block_node->subgraph.push_back(node);
}
std::unordered_map<Node *, Node *>
delelte_node_map; // deleted node to BlockNode
for (auto *n : block_node->inlinks) {
n->inlinks.clear();
}
for (auto *n : block_node->outlinks) {
n->outlinks.clear();
}
for (auto *n : block_node->inlinks) {
n->outlinks.push_back(block_node);
// Redirect the inlinks and outlinks of the sub-graph's inputs and outputs
// to point at this sub-graph node.
auto inlink_or_outlink_cleaner = [&](std::vector<Node *> &nodes) {
for (auto *&n : nodes) {
if (subgraph_uniq.count(n)) {
n = block_node;
}
}
std::unordered_set<Node *> uniq(nodes.begin(), nodes.end());
nodes.assign(uniq.begin(), uniq.end());
};
for (auto *i : block_node->inlinks) {
inlink_or_outlink_cleaner(i->outlinks);
}
for (auto *n : block_node->outlinks) {
n->inlinks.push_back(n);
for (auto *&o : block_node->outlinks) {
inlink_or_outlink_cleaner(o->inlinks);
}
}
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/node_attr_flags.h"
namespace paddle {
namespace inference {
namespace analysis {
void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
for (auto &node : graph->nodes.nodes()) {
node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get());
}
}
class DfgDebuggerPass : public DFG_GraphvizDrawPass {
public:
DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override {
return "tensorrt-subgraph-node-mark-debugger";
}
bool Finalize() override { return true; }
protected:
std::string Draw(DataFlowGraph *graph) override {
Dot dot;
// Add nodes
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (config_.display_deleted_node || !node.deleted()) {
auto dot_attr = node.dot_attrs();
if (node.attr(ATTR_supported_by_tensorrt).Bool()) {
dot_attr.assign(
{Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}});
}
dot.AddNode(node.repr(), dot_attr);
}
}
// Add edges
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) {
if (!config_.display_deleted_node && in->deleted()) continue;
dot.AddEdge(in->repr(), node.repr(), {});
}
}
return dot.Build();
}
};
Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
DFG_GraphvizDrawPass::Config config(
FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node");
return new DfgDebuggerPass(config);
}
bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines TensorRTSubgraphNodeMarkPass, which helps to mark the ops
* that are supported by the TensorRT engine.
*/
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* Mark the operators that TensorRT engine supports.
*/
class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
public:
using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller;
TensorRTSubgraphNodeMarkPass(const teller_t& teller) : teller_(teller) {}
bool Initialize(Argument* argument) override { return true; }
// This class gets a sub-graph as input and determines whether to transform this
// sub-graph into TensorRT.
void Run(DataFlowGraph* graph) override;
std::string repr() const { return "tensorrt-sub-subgraph-mark"; }
std::string description() const { return "tensorrt sub-graph mark pass"; }
Pass* CreateGraphvizDebugerPass() const override;
bool Finalize() override;
private:
teller_t teller_;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/node_attr_flags.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
// init
FluidToDataFlowGraphPass pass;
ASSERT_TRUE(pass.Initialize(&argument));
argument.main_dfg.reset(new DataFlowGraph);
pass.Run(argument.main_dfg.get());
TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
return node->IsFunction() &&
static_cast<const Function*>(node)->func_type() == "mul";
};
TensorRTSubgraphNodeMarkPass pass1(teller);
ASSERT_TRUE(pass1.Initialize(&argument));
pass1.Run(argument.main_dfg.get());
int counter{0};
for (auto& node : argument.main_dfg->nodes.nodes()) {
counter += node->attr(ATTR_supported_by_tensorrt).Bool();
}
LOG(INFO) << counter << " nodes marked";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
: node_inside_subgraph_teller_(teller) {}
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_);
SubGraphFuse(graph, node_inside_subgraph_teller_)();
}
} // namespace analysis
......
......@@ -38,6 +38,11 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
// sub-graph into TensorRT.
void Run(DataFlowGraph* graph) override;
bool Finalize() override { return true; }
std::string repr() const { return "tensorrt-sub-graph"; }
std::string description() const { return "tensorrt sub graph pass"; }
private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_;
};
......
......@@ -23,49 +23,48 @@ namespace paddle {
namespace inference {
namespace analysis {
DEFINE_string(model_dir, "", "inference test model dir");
DEFINE_string(dot_dir, "./", "");
TEST(TensorRTSubGraph, single_pass) {
auto desc = LoadProgramDesc();
auto dfg = ProgramDescToDFG(desc);
SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
TEST_F(DFG_Tester, tensorrt_single_pass) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "sigmoid"});
SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
if (node->type() != Node::Type::kFunction) return false;
const auto* func = static_cast<const Function*>(node);
if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
func->func_type() == "conv2d" || func->func_type() == "mul" ||
func->func_type() == "sigmoid" || func->func_type() == "softmax") {
LOG(INFO) << "sub-graph marked " << node->repr();
return true;
}
if (teller_set.count(func->func_type())) return true;
return false;
};
DFG_GraphvizDrawPass::Config config{"./", "test"};
DFG_GraphvizDrawPass dfg_pass(config);
dfg_pass.Initialize();
DFG_GraphvizDrawPass dfg_pass1(config);
dfg_pass1.Initialize();
dfg_pass.Run(&dfg);
LOG(INFO) << "init";
DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
DFG_GraphvizDrawPass dfg_pass(config);
DFG_GraphvizDrawPass dfg_pass1(config1);
FluidToDataFlowGraphPass pass0;
TensorRTSubGraphPass trt_pass(std::move(teller));
trt_pass.Initialize();
trt_pass.Run(&dfg);
LOG(INFO) << "Initialize";
dfg_pass.Initialize(&argument);
dfg_pass1.Initialize(&argument);
pass0.Initialize(&argument);
trt_pass.Initialize(&argument);
dfg_pass1.Run(&dfg);
LOG(INFO) << "Run";
argument.main_dfg.reset(new DataFlowGraph);
pass0.Run(argument.main_dfg.get());
dfg_pass.Run(argument.main_dfg.get());
trt_pass.Run(argument.main_dfg.get());
dfg_pass1.Run(argument.main_dfg.get());
// Check the TRT op's block desc
for (auto node : dfg.nodes.nodes()) {
for (auto& node : argument.main_dfg->nodes.nodes()) {
if (node->IsFunctionBlock()) {
LOG(INFO) << "get function block";
}
}
}
TEST(TensorRTSubGraph, pass_manager) {}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -226,7 +226,8 @@ op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND)
op_library(tensorrt_engine_op DEPS tensorrt_engine)
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter)
DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
analysis)
else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
endif()
......
......@@ -56,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel {
"Beta2 power accumulator should have 1 dimension");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of AdamOp should have same dimension");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of AdamOp should have same dimension");
}
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
"Param and Moment1 input of AdamOp should have same dimension");
......
......@@ -282,6 +282,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto& grad =
Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
if (grad.rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
// merge duplicated rows if any.
scatter::MergeAdd<DeviceContext, T> merge_func;
auto grad_merge =
......
......@@ -19,28 +19,28 @@ namespace operators {
template <>
void GetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t* num_updates_,
int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
const framework::ExecutionContext& ctx, int64_t* num_updates,
int64_t* num_accumulates, int64_t* old_num_accumulates) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
*old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
*num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
*num_updates_ = in_num_updates->data<int64_t>()[0];
*old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
*num_accumulates = in_num_accumulates->data<int64_t>()[0];
*num_updates = in_num_updates->data<int64_t>()[0];
}
template <>
void SetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t num_updates_,
int64_t num_accumulates_, int64_t old_num_accumulates_) {
const framework::ExecutionContext& ctx, int64_t num_updates,
int64_t num_accumulates, int64_t old_num_accumulates) {
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
out_num_updates->data<int64_t>()[0] = num_updates_;
out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
out_num_accumulates->data<int64_t>()[0] = num_accumulates;
out_num_updates->data<int64_t>()[0] = num_updates;
}
class AverageAccumulatesOp : public framework::OperatorWithKernel {
......@@ -177,7 +177,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
AverageAccumulates Operator.
Accumulate the sum of parameter whtin sliding window. The size of sliding window is
Accumulate the sum of parameter within sliding window. The size of sliding window is
determined by 'average_window', 'max_average_window' and 'min_average_window'.
Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
......
......@@ -54,8 +54,9 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
float average_window = ctx.Attr<float>("average_window");
int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
min_average_window =
std::min<int64_t>(min_average_window, max_average_window);
PADDLE_ENFORCE_LE(min_average_window, max_average_window,
"min_average_window shouldn't be larger than "
"max_average_window");
// Get inputs
auto* param = ctx.Input<Tensor>("param");
......
......@@ -66,6 +66,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
......@@ -111,6 +112,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats;
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
auto src_memory =
......
......@@ -155,6 +155,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_with_relu",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Batch Normalization.
......
......@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op.h"
#include <algorithm>
#include <string>
#include "paddle/fluid/operators/beam_search_decode_op.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
......@@ -22,8 +24,11 @@ namespace operators {
struct BeamSearchDecodeFunctor {
BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor, LoDTensor* score_tensor)
: step_ids_origin_(step_ids),
LoDTensor* id_tensor, LoDTensor* score_tensor,
size_t beam_size, int end_id)
: beam_size_(beam_size),
end_id_(end_id),
step_ids_origin_(step_ids),
step_scores_origin_(step_scores),
id_tensor_(id_tensor),
score_tensor_(score_tensor) {
......@@ -37,9 +42,11 @@ struct BeamSearchDecodeFunctor {
// Copy all tensors in the input tensor array
for (auto& step_id : step_ids_origin_) {
framework::LoDTensor out;
dev_ctx->Wait();
framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out);
dev_ctx->Wait();
if (step_id.numel() > 0) {
dev_ctx->Wait();
framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out);
dev_ctx->Wait();
}
out.set_lod(step_id.lod());
step_ids_.push_back(out);
......@@ -53,9 +60,12 @@ struct BeamSearchDecodeFunctor {
// Copy all tensors in the input tensor array
for (auto& step_score : step_scores_origin_) {
framework::LoDTensor out;
dev_ctx->Wait();
framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out);
dev_ctx->Wait();
if (step_score.numel() > 0) {
dev_ctx->Wait();
framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx,
&out);
dev_ctx->Wait();
}
out.set_lod(step_score.lod());
step_scores_.push_back(out);
......@@ -67,6 +77,8 @@ struct BeamSearchDecodeFunctor {
void operator()() const;
bool tensor_on_gpu_;
size_t beam_size_;
int end_id_;
const LoDTensorArray& step_ids_origin_;
const LoDTensorArray& step_scores_origin_;
LoDTensorArray step_ids_ = LoDTensorArray();
......@@ -77,14 +89,14 @@ struct BeamSearchDecodeFunctor {
template <typename T>
void BeamSearchDecodeFunctor::operator()() const {
BeamSearchDecoder<T> beam_search_decoder;
BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
// Check if the tensor is on GPU. If so, use the CPU copy instead
if (tensor_on_gpu_) {
beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_,
score_tensor_);
beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_,
score_tensor_);
} else {
beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_,
id_tensor_, score_tensor_);
beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_,
id_tensor_, score_tensor_);
}
}
......@@ -122,13 +134,17 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
"Level of LodTensor should be 2");
}
size_t beam_size = ctx.Attr<int>("beam_size");
int end_id = ctx.Attr<int>("end_id");
// prepare output
LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
framework::VisitDataType(
framework::ToDataType(scores->at(0).type()),
BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores));
BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores,
beam_size, end_id));
}
};
......@@ -137,18 +153,32 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddInput("Ids",
"(LodTensorArray)"
"score of the candidate words in each step");
"The LodTensorArray containing the selected ids of all steps");
AddInput("Scores",
"(LodTensorArray)"
"score of the candidate words in each step");
AddOutput("SentenceIds",
"(LodTensor)"
"All possible result sentences of word ids");
AddOutput("SentenceScores",
"(LodTensor)"
"All possible result sentences of word scores");
"The LodTensorArray containing the selected scores of all steps");
AddOutput(
"SentenceIds",
"(LodTensor)"
"An LodTensor containing all generated id sequences for all source "
"sentences");
AddOutput(
"SentenceScores",
"(LodTensor)"
"An LodTensor containing scores corresponding to Output(SentenceIds)");
AddAttr<int>("beam_size", "beam size for beam search");
AddAttr<int>("end_id",
"the token id which indicates the end of a sequence");
AddComment(R"DOC(
Pack the result of Beam search op into SentenceIds and SentenceScores.
Beam Search Decode Operator. This Operator constructs the full hypotheses for
each source sentence by walking back along the LoDTensorArray Input(Ids),
whose lods can be used to restore the path in the beam search tree.
The Output(SentenceIds) and Output(SentenceScores) separately contain the
generated id sequences and the corresponding scores. The shapes and lods of the
two LodTensors are the same. The lod level is 2, and the two levels separately
indicate how many hypotheses each source sentence has and how many ids each
hypothesis has.
)DOC");
}
};
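// Illustrative output lods (numbers are made up; cf. the expected lods in the
// Backtrace unit test further below): with 2 source sentences and beam_size = 2,
// Output(SentenceIds) may carry
//   lod[0] = {0, 2, 4}          // 2 hypotheses per source sentence
//   lod[1] = {0, 4, 7, 12, 17}  // number of ids in each hypothesis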
......@@ -172,10 +202,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
for (auto& o : op_desc.Output("SentenceIds")) {
block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR);
auto& sentence_ids = block->FindRecursiveOrCreateVar(o);
sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR);
}
for (auto& o : op_desc.Output("SentenceScores")) {
block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR);
auto& sentence_scores = block->FindRecursiveOrCreateVar(o);
sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR);
}
}
};
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -25,42 +27,12 @@ using LoDTensor = framework::LoDTensor;
using LoDTensorArray = framework::LoDTensorArray;
// all the lod have 2 levels.
// The First is source level, the second is sentence level.
// source level describe how many candidate words for this source.
// sentence level describe these candidates belong to which prefix
// The first is the source level, the second is the sentence level.
// The source level describes how many prefixes (branches) each source sentence
// (beam) has; the sentence level describes how these candidates belong to the prefixes.
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;
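// Illustrative example (made-up numbers): for one step with two source
// sentences, each holding two prefixes that expand to two candidates, the lod
// of that step's ids could be
//   lod[kSourceLevel]   = {0, 2, 4}        // prefixes owned by each source
//   lod[kSentenceLevel] = {0, 2, 4, 6, 8}  // candidate ids under each prefix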
template <typename T>
struct BeamNode {
BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
~BeamNode() {
if (parent_) {
parent_->DropKid(this);
if (parent_->kids_.size() == 0UL) {
delete parent_;
}
}
VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
}
void AppendTo(BeamNode* parent) {
parent_ = parent;
parent->kids_.insert(this);
}
void DropKid(BeamNode* kid) { kids_.erase(kid); }
BeamNode* parent_ = nullptr;
std::unordered_set<BeamNode*> kids_;
int64_t word_id_;
T score_;
};
template <typename T>
using BeamNodeVector = std::vector<std::unique_ptr<BeamNode<T>>>;
template <typename T>
struct Sentence {
std::vector<int64_t> word_ids;
......@@ -72,24 +44,8 @@ using SentenceVector = std::vector<Sentence<T>>;
template <typename T>
struct BeamSearchDecoder {
/**
* make a BeamNode and all it's related prefix BeanNode into a Sentence.
*/
Sentence<T> MakeSentence(const BeamNode<T>* node) const;
/**
* Param:
* cur_ids: LoDTensor of One step for word ID
* cur_scores: LoDTensor of One Step for word score
* prefixes_list: prefixes for each source sentence.
* sentence_vector_list: result sentence_vector for each source sentence.
* Return:
* a new prefixes list for each source of current step
*/
std::vector<BeamNodeVector<T>> PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>* prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const;
BeamSearchDecoder(size_t beam_size, int end_id)
: beam_size_(beam_size), end_id_(end_id) {}
/**
* convert the result sentence_vector for each source sentence into two
......@@ -100,107 +56,30 @@ struct BeamSearchDecoder {
* sentence_vector_list: sentence_vector for each source sentence.
* id_tensor: result LoDTensor for sentences of id.
* score_tensor: result LoDTensor for sentences of score.
* reverse: whether the ids of each sentence in sentence_vector_list are reversed
* sort_by_score: whether to sort hypotheses of each sentence by scores.
*/
void ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
LoDTensor* score_tensor) const;
LoDTensor* score_tensor, bool reverse = true,
bool sort_by_score = true) const;
/**
* Pack all steps of id/score LodTensor into sentence LoDTensor
* it's main logic is:
* ```python
* prefix
* result_sentence
* result_lod_tensor
*
* for (step in steps):
* prefix = PackTwoSteps(prefix, step, &result_sentence)
* ConvertSentenceVector<T>ToLodTensor(result_sentence, &result_lod_tensor)
* ```
* Gather the hypotheses for each source sentence by backtracing through the
* LoDTensorArray step_ids, whose lods preserve the paths in the tree.
*/
void PackAllSteps(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores, LoDTensor* id_tensor,
LoDTensor* score_tensor) const;
};
template <typename T>
Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
Sentence<T> sentence;
while (node != nullptr) {
sentence.word_ids.emplace_back(node->word_id_);
sentence.scores.emplace_back(node->score_);
node = node->parent_;
}
std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids));
std::reverse(std::begin(sentence.scores), std::end(sentence.scores));
return sentence;
}
template <typename T>
std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>* prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const {
std::vector<BeamNodeVector<T>> result;
void Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores, LoDTensor* id_tensor,
LoDTensor* score_tensor) const;
for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1;
++src_idx) {
size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
BeamNodeVector<T> beam_nodes;
// if prefixes size is 0, it means this is the first step. In this step,
// all candidate id is the start of candidate sentences.
if (prefixes_list->empty()) {
PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
cur_ids.lod().at(kSentenceLevel).back(),
"in the first step");
for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(new BeamNode<T>(
cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
}
} else {
BeamNodeVector<T>& prefixes = prefixes_list->at(src_idx);
SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
"prefix and candidate set number should be the same");
auto candidate_offset = cur_ids.lod()[kSentenceLevel];
for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
std::unique_ptr<BeamNode<T>>& prefix = prefixes[prefix_idx];
size_t candidate_start = candidate_offset[src_start + prefix_idx];
size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
if (candidate_start == candidate_end) {
VLOG(3) << "this sentence has no more candidate, "
"add to result sentence and rm it from beam tree";
sentence_vector.push_back(MakeSentence(prefix.get()));
prefix.reset();
} else {
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end; ++candidate_idx) {
auto* candidate =
new BeamNode<T>(cur_ids.data<int64_t>()[candidate_idx],
cur_scores.data<T>()[candidate_idx]);
candidate->AppendTo(prefix.get());
beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(candidate));
}
prefix.release();
}
}
}
result.push_back(std::move(beam_nodes));
}
return result;
}
size_t beam_size_;
int end_id_;
};
template <typename T>
void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
LoDTensor* score_tensor, bool reverse, bool sort_by_score) const {
size_t src_num = sentence_vector_list.size();
PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0");
......@@ -211,11 +90,29 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
std::vector<T> score_data;
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
if (sort_by_score) {
sort(sentence_vector_list[src_idx].begin(),
sentence_vector_list[src_idx].end(),
[reverse](const Sentence<T>& a, const Sentence<T>& b) {
if (reverse)
return a.scores.front() > b.scores.front();
else
return a.scores.back() > b.scores.back();
});
}
for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
id_data.insert(id_data.end(), sentence.word_ids.begin(),
sentence.word_ids.end());
score_data.insert(score_data.end(), sentence.scores.begin(),
sentence.scores.end());
if (reverse) {
id_data.insert(id_data.end(), sentence.word_ids.rbegin(),
sentence.word_ids.rend());
score_data.insert(score_data.end(), sentence.scores.rbegin(),
sentence.scores.rend());
} else {
id_data.insert(id_data.end(), sentence.word_ids.begin(),
sentence.word_ids.end());
score_data.insert(score_data.end(), sentence.scores.begin(),
sentence.scores.end());
}
sentence_level_lod.push_back(sentence_level_lod.back() +
sentence.word_ids.size());
}
......@@ -243,39 +140,75 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
}
template <typename T>
void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
void BeamSearchDecoder<T>::Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
"step_ids and step_scores should be the same");
const size_t step_num = step_ids.size();
const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
std::vector<SentenceVector<T>> sentence_vector_list(
src_num, SentenceVector<T>(beam_size_));
std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
for (int step_id = step_num - 1; step_id >= 0; --step_id) {
auto& cur_ids = step_ids.at(step_id);
auto& cur_scores = step_scores.at(step_id);
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
// for each source sentence
auto& sentence_vector = sentence_vector_list.at(src_idx);
auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
if (prefix_idx_vector.empty()) { // finished and pruned at this step,
// or this is the last time step
for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end;
++prefix_idx) {
size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx];
size_t candidate_end =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end; ++candidate_idx) {
prefix_idx_vector.push_back(prefix_idx);
size_t idx = prefix_idx_vector.size() - 1;
auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
}
} else { // use prefix_idx_vector to backtrace
size_t src_candidate_start =
cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
size_t prefix_idx = src_prefix_start;
size_t candidate_num =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
auto candidate_idx = prefix_idx_vector.at(idx);
auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
// to skip redundant end tokens
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0");
// previous prefixes for each step,
// the init length is 0, means this is the first step.
std::vector<BeamNodeVector<T>> beamnode_vector_list(0);
std::vector<SentenceVector<T>> sentence_vector_list(src_num);
// pack all steps for one batch first, then another batch
for (size_t step_id = 0; step_id < step_num; ++step_id) {
beamnode_vector_list =
PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
&beamnode_vector_list, &sentence_vector_list);
}
// append last beam_node to result
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
for (auto& beam_node : beamnode_vector_list.at(src_idx)) {
sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get()));
beam_node.reset();
while (src_candidate_start + candidate_num <=
candidate_idx) { // search the corresponding prefix
prefix_idx++;
candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
}
prefix_idx_vector.at(idx) = prefix_idx;
}
}
}
}
ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
score_tensor);
score_tensor, true, true);
}
} // namespace operators
......
......@@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD;
using LoDTensor = paddle::framework::LoDTensor;
using LoDTensorArray = paddle::framework::LoDTensorArray;
template <typename T>
using BeamNode = paddle::operators::BeamNode<T>;
template <typename T>
using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
template <typename T>
using Sentence = paddle::operators::Sentence<T>;
template <typename T>
using BeamNodeVector = paddle::operators::BeamNodeVector<T>;
template <typename T>
using SentenceVector = paddle::operators::SentenceVector<T>;
namespace paddle {
......@@ -77,138 +73,50 @@ void GenerateExample(const std::vector<size_t>& level_0,
} // namespace test
} // namespace paddle
TEST(BeamSearchDecodeOp, DeleteBeamNode) {
auto* root = new BeamNode<float>(0, 0);
auto* b1 = new BeamNode<float>(1, 1);
auto* b2 = new BeamNode<float>(2, 2);
auto* b3 = new BeamNode<float>(3, 3);
b1->AppendTo(root);
b2->AppendTo(root);
b3->AppendTo(b1);
delete b3;
delete b2;
}
TEST(BeamSearchDecodeOp, MakeSentence) {
auto* root = new BeamNode<float>(0, 0);
auto* b1 = new BeamNode<float>(1, 1);
auto* end = new BeamNode<float>(2, 2);
b1->AppendTo(root);
end->AppendTo(b1);
BeamSearchDecoder<float> helper;
Sentence<float> sentence = helper.MakeSentence(end);
delete end;
std::vector<int64_t> expect_ids = {0, 1, 2};
ASSERT_EQ(sentence.word_ids, expect_ids);
std::vector<float> expect_scores = {0, 1, 2};
ASSERT_EQ(sentence.scores, expect_scores);
}
TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
CPUPlace place;
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateExample(
std::vector<size_t>{0, 2, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
std::vector<BeamNodeVector<float>> beamnode_vector_list;
std::vector<SentenceVector<float>> sentence_vector_list(
2, SentenceVector<float>());
BeamSearchDecoder<float> helper;
beamnode_vector_list = helper.PackTwoSteps(
ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(beamnode_vector_list.size(), 2UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
}
TEST(BeamSearchDecodeOp, PackTwoSteps) {
CPUPlace place;
// first source has three prefix
BeamNodeVector<float> source0_prefixes;
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(1, 1)));
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(0, 0)));
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(3, 3)));
// second source has two prefix
BeamNodeVector<float> source1_prefixes;
source1_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(4, 4)));
source1_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(5, 5)));
std::vector<BeamNodeVector<float>> beamnode_vector_list;
std::vector<SentenceVector<float>> sentence_vector_list(
2, SentenceVector<float>());
beamnode_vector_list.push_back(std::move(source0_prefixes));
beamnode_vector_list.push_back(std::move(source1_prefixes));
// generate data for one step
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateExample(std::vector<size_t>{0, 3, 5},
std::vector<size_t>{0, 1, 1, 3, 4, 5},
std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
BeamSearchDecoder<float> helper1;
beamnode_vector_list = helper1.PackTwoSteps(
ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 3UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 2UL);
}
TEST(BeamSearchDecodeOp, PackAllSteps) {
TEST(BeamSearchDecodeOp, Backtrace) {
CPUPlace place;
// we will constuct a sample data with 3 steps and 2 source sentences
// Construct sample data with 5 steps and 2 source sentences
// beam_size = 2, start_id = 0, end_id = 1
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateExample(
std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
std::vector<size_t>{0, 1, 2}, std::vector<size_t>{0, 1, 2},
std::vector<int>{0, 0}, &ids, &scores); // start with start_id
paddle::test::GenerateExample(std::vector<size_t>{0, 1, 2},
std::vector<size_t>{0, 2, 4},
std::vector<int>{2, 3, 4, 5}, &ids, &scores);
paddle::test::GenerateExample(std::vector<size_t>{0, 2, 4},
std::vector<size_t>{0, 2, 2, 4, 4},
std::vector<int>{3, 1, 5, 4}, &ids, &scores);
paddle::test::GenerateExample(std::vector<size_t>{0, 2, 4},
std::vector<size_t>{0, 1, 2, 3, 4},
std::vector<int>{1, 1, 3, 5}, &ids, &scores);
paddle::test::GenerateExample(
std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 1, 3, 5, 5, 6},
std::vector<int>{0, 1, 2, 3, 4, 5}, &ids, &scores);
paddle::test::GenerateExample(std::vector<size_t>{0, 3, 6},
std::vector<size_t>{0, 0, 1, 2, 3, 4, 5},
std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
std::vector<size_t>{0, 2, 4},
std::vector<size_t>{0, 0, 0, 2,
                              2},  // the branches of the first source sentence
                                   // are pruned since finished
std::vector<int>{5, 1},
&ids, &scores);
ASSERT_EQ(ids.size(), 3UL);
ASSERT_EQ(scores.size(), 3UL);
ASSERT_EQ(ids.size(), 5UL);
ASSERT_EQ(scores.size(), 5UL);
BeamSearchDecoder<float> helper;
BeamSearchDecoder<float> helper(2, 1); // beam_size = 2, end_id = 1
LoDTensor id_tensor;
LoDTensor score_tensor;
helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor);
helper.Backtrace(ids, scores, &id_tensor, &score_tensor);
LoD lod = id_tensor.lod();
std::vector<size_t> expect_source_lod = {0, 4, 8};
std::vector<size_t> expect_source_lod = {0, 2, 4};
EXPECT_EQ(lod[0], expect_source_lod);
std::vector<size_t> expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19};
std::vector<size_t> expect_sentence_lod = {0, 4, 7, 12, 17};
EXPECT_EQ(lod[1], expect_sentence_lod);
// 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4
std::vector<int> expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5,
4, 3, 2, 4, 4, 3, 6, 5, 4};
std::vector<int> expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4,
5, 3, 5, 0, 4, 5, 3, 1};
ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
for (size_t i = 0; i < expect_data.size(); ++i) {
ASSERT_EQ(id_tensor.data<int64_t>()[i],
......
......@@ -12,25 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/beam_search_op.h"
namespace paddle {
namespace operators {
void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores) {
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
auto items = SelectTopBeamSizeItems();
auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
......@@ -39,7 +40,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
VLOG(3) << ItemToString(item);
}
}
PruneEndidCandidates(pre_ids, &selected_items);
PruneEndBeams(pre_ids, &selected_items);
// calculate the output tensor's height
size_t num_instances = std::accumulate(
std::begin(selected_items), std::end(selected_items), 0,
......@@ -61,12 +63,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
size_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
sort(items.begin(), items.end(), [](const Item &a, const Item &b) {
if (a.offset < b.offset) {
return true;
}
return a.id < b.id;
});
for (auto &item : items) {
ids_data[low_offset] = item.id;
scores_data[low_offset] = item.score;
......@@ -86,21 +82,31 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
selected_scores->set_lod(lod);
}
int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
std::vector<std::vector<Item>> *items) {
void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids,
std::vector<std::vector<Item>> *items) {
auto *pre_ids_data = pre_ids.data<int64_t>();
int res = 0;
for (size_t offset = 0; offset < items->size(); offset++) {
auto prefix_id = pre_ids_data[offset];
if (prefix_id == end_id_) {
items->at(offset).clear();
} else {
res++;
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
size_t src_prefix_start = high_level[src_idx];
size_t src_prefix_end = high_level[src_idx + 1];
bool finish_flag = true;
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) {
for (auto &item : items->at(offset)) {
if (item.id != static_cast<size_t>(end_id_) ||
pre_ids_data[offset] != end_id_) {
finish_flag = false;
break;
}
}
if (!finish_flag) break;
}
    if (finish_flag) {  // all branches of the beam (source sentence) have
                        // ended, so prune this beam
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++)
items->at(offset).clear();
}
}
return res;
}
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
......@@ -115,19 +121,17 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
return result;
}
std::vector<std::vector<BeamSearch::Item>>
BeamSearch::SelectTopBeamSizeItems() {
std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores) {
std::vector<std::vector<Item>> result;
std::vector<Item> items;
// for each source sentence, select the top beam_size items across all
// candidate sets.
while (NextItemSet(&items)) {
std::nth_element(std::begin(items), std::begin(items) + beam_size_,
std::end(items), [](const Item &a, const Item &b) {
// TODO(superjom) make score's comparation customizable.
// partial sort in descending order
return a.score > b.score;
});
while (NextItemSet(pre_ids, pre_scores, &items)) {
std::nth_element(
std::begin(items), std::begin(items) + beam_size_, std::end(items),
[](const Item &a, const Item &b) { return a.score > b.score; });
// prune the top beam_size items.
if (items.size() > beam_size_) {
items.resize(beam_size_);
......@@ -146,7 +150,9 @@ BeamSearch::SelectTopBeamSizeItems() {
}
// the candidates of a source
bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
std::vector<BeamSearch::Item> *items) {
if (sent_offset_ >= ids_->NumElements(lod_level_)) {
return false;
}
......@@ -164,14 +170,24 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
instance_dim *= ids.dims()[i];
}
auto *pre_ids_data = pre_ids.data<int64_t>();
auto *pre_scores_data = pre_scores.data<float>();
items->clear();
items->reserve(framework::product(ids.dims()));
for (size_t offset = abs_lod[lod_level_][sent_offset_];
offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
for (size_t d = 0; d < instance_dim; d++) {
const size_t dim_offset = offset * instance_dim + d;
items->emplace_back(offset, ids_data[dim_offset],
scores_data[dim_offset]);
auto pre_id = pre_ids_data[offset];
auto pre_score = pre_scores_data[offset];
if (pre_id == end_id_) {
      // Allocate all probability mass to eos_id for finished branches so that
      // the other candidate ids can be ignored.
items->emplace_back(offset, end_id_, pre_score);
} else {
for (size_t d = 0; d < instance_dim; d++) {
const size_t dim_offset = offset * instance_dim + d;
items->emplace_back(offset, ids_data[dim_offset],
scores_data[dim_offset]);
}
}
}
......@@ -199,15 +215,27 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
// inputs and outputs stored in proto
AddInput("pre_ids", "ids in previous step");
AddInput("ids", "a LoDTensor of shape of [None,k]");
AddInput("pre_ids",
"(LoDTensor) The LoDTensor containing the selected ids at the "
"previous step. It should be a tensor with shape (batch_size, 1) "
"and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at "
"thefirst step.");
AddInput("pre_scores",
"(LoDTensor) The LoDTensor containing the accumulated "
"scores corresponding to the selected ids at the previous step.");
AddInput("ids",
"(LoDTensor) The LoDTensor containing the candidates ids. Its "
"shape should be (batch_size * beam_size, K), where K supposed to "
"be beam_size.");
AddInput("scores",
"a LoDTensor that has the same shape and LoD with `ids`");
"(LoDTensor) The LodTensor containing the accumulated scores "
"corresponding to Input(ids) and its shape is the same as the "
"shape of Input(ids).");
AddOutput("selected_ids",
"a LoDTensor that stores the IDs selected by beam search");
AddOutput(
"selected_scores",
"a LoDTensor that has the same shape and LoD with `selected_ids`");
"A LodTensor that stores the IDs selected by beam search.");
AddOutput("selected_scores",
"A LoDTensor containing the accumulated scores corresponding to "
"Output(selected_ids).");
// Attributes stored in AttributeMap
AddAttr<int>("level", "the level of LoDTensor");
......@@ -215,8 +243,21 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("end_id",
"the token id which indicates the end of a sequence");
AddComment(
"This is a beam search operator that help to generate sequences.");
AddComment(R"DOC(
This operator does the search in beams for one time step.
Specifically, it selects the top-K candidate word ids of the current step from
Input(ids) according to their Input(scores) for all source sentences,
where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results
from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores)
are the outputs of beam_search at the previous step; they are needed to
specially handle candidate translations that have already ended. The paths
linking prefixes and selected candidates are organized and preserved in lod.
Note that the Input(scores) passed in should be accumulated scores; if a
length penalty is needed, apply it with extra operators before computing the
accumulated scores. It is also suggested to obtain the top-K candidates
beforehand and pass only those in.
)DOC");
}
};
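As a rough, non-authoritative sketch of the note above (it mirrors the Python layer example added later in this change; `probs`, `pre_ids`, `pre_scores`, `beam_size` and `end_id` are assumed to come from the surrounding decoder):

    import paddle.fluid as fluid

    # accumulated scores are built from the top-K probabilities before the op runs
    topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
    accu_scores = fluid.layers.elementwise_add(
        x=fluid.layers.log(topk_scores),
        y=fluid.layers.reshape(pre_scores, shape=[-1]),
        axis=0)
    selected_ids, selected_scores = fluid.layers.beam_search(
        pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices,
        scores=accu_scores, beam_size=beam_size, end_id=end_id)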
......@@ -253,10 +294,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference {
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &o : op_desc.Output("selected_ids")) {
block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR);
auto &selected_ids = block->FindRecursiveOrCreateVar(o);
selected_ids.SetType(framework::proto::VarType::LOD_TENSOR);
}
for (auto &o : op_desc.Output("selected_scores")) {
block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR);
auto &selected_scores = block->FindRecursiveOrCreateVar(o);
selected_scores.SetType(framework::proto::VarType::LOD_TENSOR);
}
}
};
......
......@@ -132,6 +132,7 @@ class BeamSearch {
   * that means no candidates are provided, and the task will stop running.
*/
void operator()(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores);
/*
......@@ -153,14 +154,16 @@ class BeamSearch {
protected:
/*
* Delete all the records that follows the end token.
   * Prune the source sentences whose branches are all finished; it is optional.
   * Pruning must happen one step later than finishing (thus pre_ids is needed
   * here), since the end tokens must be written out.
*/
int PruneEndidCandidates(const framework::LoDTensor& pre_ids,
std::vector<std::vector<Item>>* items);
void PruneEndBeams(const framework::LoDTensor& pre_ids,
std::vector<std::vector<Item>>* items);
/*
* Transform the items into a map whose key is offset, value is the items.
* NOTE low performance
* NOTE low performance.
*/
std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs, size_t element_num);
......@@ -168,12 +171,16 @@ class BeamSearch {
/*
* For each source, select top beam_size records.
*/
std::vector<std::vector<Item>> SelectTopBeamSizeItems();
std::vector<std::vector<Item>> SelectTopBeamSizeItems(
const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores);
/*
* Get the items of next source sequence, return false if no remaining items.
*/
bool NextItemSet(std::vector<Item>* items);
bool NextItemSet(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
std::vector<Item>* items);
private:
size_t beam_size_;
......@@ -192,24 +199,25 @@ template <typename DeviceContext, typename T>
class BeamSearchOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* ids_var = context.Input<framework::LoDTensor>("ids");
auto* scores_var = context.Input<framework::LoDTensor>("scores");
auto* pre_ids_var = context.Input<framework::LoDTensor>("pre_ids");
PADDLE_ENFORCE_NOT_NULL(ids_var);
PADDLE_ENFORCE_NOT_NULL(scores_var);
PADDLE_ENFORCE_NOT_NULL(pre_ids_var);
auto* ids = context.Input<framework::LoDTensor>("ids");
auto* scores = context.Input<framework::LoDTensor>("scores");
auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
PADDLE_ENFORCE_NOT_NULL(ids);
PADDLE_ENFORCE_NOT_NULL(scores);
PADDLE_ENFORCE_NOT_NULL(pre_ids);
PADDLE_ENFORCE_NOT_NULL(pre_scores);
size_t level = context.Attr<int>("level");
size_t beam_size = context.Attr<int>("beam_size");
int end_id = context.Attr<int>("end_id");
BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id);
auto selected_ids_var =
context.Output<framework::LoDTensor>("selected_ids");
auto selected_scores_var =
BeamSearch alg(*ids, *scores, level, beam_size, end_id);
auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
auto selected_scores =
context.Output<framework::LoDTensor>("selected_scores");
PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
PADDLE_ENFORCE_NOT_NULL(selected_scores_var);
alg(*pre_ids_var, selected_ids_var, selected_scores_var);
PADDLE_ENFORCE_NOT_NULL(selected_ids);
PADDLE_ENFORCE_NOT_NULL(selected_scores);
alg(*pre_ids, *pre_scores, selected_ids, selected_scores);
}
};
} // namespace operators
......
......@@ -30,7 +30,7 @@ using std::endl;
void CreateInput(LoDTensor* ids, LoDTensor* scores) {
LoD lod;
vector<size_t> level0({0, 1, 4});
vector<size_t> level0({0, 2, 4});
vector<size_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
......@@ -64,17 +64,22 @@ TEST(beam_search_op, run) {
for (int i = 0; i < 4; i++) {
pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
}
LoDTensor pre_scores;
pre_scores.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
for (int i = 0; i < 4; i++) {
pre_scores.mutable_data<float>(place)[i] = 0.1 * (i + 1);
}
BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0);
BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0);
LoDTensor sids, sscores;
beamsearch(pre_ids, &sids, &sscores);
beamsearch(pre_ids, pre_scores, &sids, &sscores);
LOG(INFO) << "score: " << sscores << endl;
ASSERT_EQ(sids.lod(), sscores.lod());
vector<int> tids({2, 4, 3, 8});
vector<float> tscores({0.3, 0.5, 0.9, 0.7});
vector<int> tids({4, 2, 3, 8});
vector<float> tscores({0.5, 0.6, 0.9, 0.7});
for (int i = 0; i < 4; i++) {
ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
......
......@@ -302,6 +302,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators;
// conv2d_transpose
REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
......@@ -317,6 +318,7 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
// conv3d_transpose
REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
ops::Conv3DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
......@@ -331,3 +333,19 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
// depthwise conv2d_transpose
REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
......@@ -15,25 +15,28 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h"
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(
conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
double>);
REGISTER_OP_CUDA_KERNEL(
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
double>);
// conv2d
REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>,
ops::GemmConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<CUDA, double>);
// conv3d
REGISTER_OP_CUDA_KERNEL(conv3d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>,
ops::GemmConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<CUDA, double>);
// depthwise conv2d
REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose,
ops::DepthwiseConvTransposeKernel<CUDA, float>,
ops::DepthwiseConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad,
ops::DepthwiseConvTransposeGradKernel<CUDA, float>,
ops::DepthwiseConvTransposeGradKernel<CUDA, double>);
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
......@@ -316,5 +317,74 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
}
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(groups, filter.dims()[0]);
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1);
}
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
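    // Note: the forward pass of the transposed depthwise convolution is
    // computed as the input-gradient of a regular depthwise convolution,
    // which is why DepthwiseConvInputGradFunctor is reused below with `input`
    // acting as the "output gradient" and `output` receiving the result.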
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
output);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
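    // Conversely, the input gradient of the transposed convolution is a plain
    // depthwise convolution of the output gradient (DepthwiseConvFunctor), and
    // the filter gradient is accumulated by DepthwiseConvFilterGradFunctor.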
if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
input_grad);
}
if (filter_grad) {
math::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
filter_grad);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -51,6 +51,12 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
}
};
template <class T>
bool DistPairDescend(std::tuple<int, int, T> pair1,
std::tuple<int, int, T> pair2) {
return std::get<2>(pair1) > std::get<2>(pair2);
}
template <typename T>
class BipartiteMatchKernel : public framework::OpKernel<T> {
public:
......@@ -58,46 +64,76 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
// The match_dist must be initialized to 0 at first.
void BipartiteMatch(const Tensor& dist, int* match_indices,
T* match_dist) const {
constexpr T kEPS = static_cast<T>(1e-6);
PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
int64_t row = dist.dims()[0];
int64_t col = dist.dims()[1];
auto* dist_data = dist.data<T>();
std::vector<int> row_pool;
for (int i = 0; i < row; ++i) {
row_pool.push_back(i);
}
while (row_pool.size() > 0) {
int max_idx = -1;
int max_row_idx = -1;
T max_dist = -1;
for (int64_t j = 0; j < col; ++j) {
if (match_indices[j] != -1) {
continue;
    // Test result: when row == 130, the two methods run at almost the same speed
if (row >= 130) {
std::vector<std::tuple<int, int, T>> match_pair;
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j]));
}
for (size_t k = 0; k < row_pool.size(); ++k) {
int m = row_pool[k];
// distance is 0 between m-th row and j-th column
if (dist_data[m * col + j] < kEPS) {
}
std::sort(match_pair.begin(), match_pair.end(), DistPairDescend<T>);
std::vector<int> row_indices(row, -1);
int64_t idx = 0;
for (int64_t k = 0; k < row * col; ++k) {
int64_t i = std::get<0>(match_pair[k]);
int64_t j = std::get<1>(match_pair[k]);
T dist = std::get<2>(match_pair[k]);
if (idx >= row) {
break;
}
if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) {
match_indices[j] = i;
row_indices[i] = j;
match_dist[j] = dist;
idx += 1;
}
}
} else {
constexpr T kEPS = static_cast<T>(1e-6);
std::vector<int> row_pool;
for (int i = 0; i < row; ++i) {
row_pool.push_back(i);
}
while (row_pool.size() > 0) {
int max_idx = -1;
int max_row_idx = -1;
T max_dist = -1;
for (int64_t j = 0; j < col; ++j) {
if (match_indices[j] != -1) {
continue;
}
if (dist_data[m * col + j] > max_dist) {
max_idx = j;
max_row_idx = m;
max_dist = dist_data[m * col + j];
for (size_t k = 0; k < row_pool.size(); ++k) {
int m = row_pool[k];
// distance is 0 between m-th row and j-th column
if (dist_data[m * col + j] < kEPS) {
continue;
}
if (dist_data[m * col + j] > max_dist) {
max_idx = j;
max_row_idx = m;
max_dist = dist_data[m * col + j];
}
}
}
}
if (max_idx == -1) {
// Cannot find good match.
break;
} else {
PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
match_indices[max_idx] = max_row_idx;
match_dist[max_idx] = max_dist;
// Erase the row index.
row_pool.erase(
std::find(row_pool.begin(), row_pool.end(), max_row_idx));
if (max_idx == -1) {
// Cannot find good match.
break;
} else {
PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
match_indices[max_idx] = max_row_idx;
match_dist[max_idx] = max_dist;
// Erase the row index.
row_pool.erase(
std::find(row_pool.begin(), row_pool.end(), max_row_idx));
}
}
}
}
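For reference, the global-sort branch above behaves like the following minimal Python sketch (an illustration only; `dist` is a (row, col) numpy similarity matrix, and none of these names are part of the operator's interface):

    import numpy as np

    def bipartite_match_global(dist):
        row, col = dist.shape
        # sort all (distance, row, column) triples in descending order of distance
        pairs = sorted(((dist[i, j], i, j) for i in range(row) for j in range(col)),
                       reverse=True)
        match_indices = np.full(col, -1, dtype=np.int64)  # column -> matched row
        match_dist = np.zeros(col)
        matched_rows = set()
        for d, i, j in pairs:
            if len(matched_rows) >= row:
                break
            if d > 0 and match_indices[j] == -1 and i not in matched_rows:
                match_indices[j] = i
                matched_rows.add(i)
                match_dist[j] = d
        return match_indices, match_dist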
......
......@@ -26,8 +26,12 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
"Input(X) of FillZerosLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillZerosLikeOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
if (ctx->IsRuntime() &&
ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR_ARRAY) {
      return;  // skip runtime infershape when the output is a tensor array
}
}
};
......@@ -39,7 +43,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
FillZerosLike Operator.
Fill up a variable with zeros.
Fill up a variable with zeros, supporting both LoDTensor and LoDTensorArray.
The output will have the same size as the input.
)DOC");
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -23,12 +24,29 @@ template <typename DeviceContext, typename T>
class FillZerosLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), out,
static_cast<T>(0));
auto var = context.InputVar("X");
if (var->IsType<framework::LoDTensor>()) {
auto& input = *context.Input<framework::LoDTensor>("X");
auto& output = *context.Output<framework::LoDTensor>("Out");
output.Resize(input.dims());
output.set_lod(input.lod());
output.mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), &(output),
static_cast<T>(0));
} else if (var->IsType<framework::LoDTensorArray>()) {
auto& input = *context.Input<framework::LoDTensorArray>("X");
auto& output = *context.Output<framework::LoDTensorArray>("Out");
output.resize(input.size());
for (auto i = 0; i < input.size(); i++) {
output[i].Resize(input[i].dims());
output[i].set_lod(input[i].lod());
output[i].mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), &(output[i]),
static_cast<T>(0));
}
}
}
};
......
......@@ -24,6 +24,7 @@ reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_o
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent
......
......@@ -88,24 +88,29 @@ class BlockingQueue {
receive_cv_.notify_all();
}
bool IsClosed() {
bool IsClosed() const {
std::lock_guard<std::mutex> lock(mutex_);
return closed_;
}
size_t Cap() {
size_t Cap() const {
std::lock_guard<std::mutex> lock(mutex_);
return capacity_;
}
size_t Size() const {
std::lock_guard<std::mutex> lock(mutex_);
return queue_.size();
}
private:
size_t capacity_;
bool closed_;
std::deque<T> queue_;
std::mutex mutex_;
std::condition_variable receive_cv_;
std::condition_variable send_cv_;
mutable std::mutex mutex_;
mutable std::condition_variable receive_cv_;
mutable std::condition_variable send_cv_;
};
} // namespace reader
} // namespace operators
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class PyReader : public framework::ReaderBase {
public:
explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue) {
PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
queue_ = queue;
}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
bool success;
*out = queue_->Pop(&success);
if (!success) out->clear();
}
void ReInit() override {}
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
class CreatePyReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) return;
const std::string& queue_name = Input("blocking_queue");
auto* queue_holder_var = scope.FindVar(queue_name);
PADDLE_ENFORCE(
queue_holder_var != nullptr,
"No LoDTensorBlockingQueueHolder variable with name %s found",
queue_name);
auto* queue_holder =
queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
out->Reset(new PyReader(queue_holder->GetQueue()));
}
};
class CreatePyReaderOpMaker : public FileReaderMakerBase {
protected:
void Apply() override {
AddInput("blocking_queue",
"Name of the `LoDTensorBlockingQueueHolder` variable");
AddComment(R"DOC(
Create PyReader to support LoDTensor data feeding in Python side.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace reader = ::paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(create_py_reader, reader::CreatePyReaderOp,
reader::CreatePyReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
namespace reader {
class LoDTensorBlockingQueueHolder;
class LoDTensorBlockingQueue {
friend class LoDTensorBlockingQueueHolder;
private:
LoDTensorBlockingQueue(size_t capacity,
const std::vector<framework::DDim>& dims)
: queue_(capacity), dims_(dims) {}
public:
bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
CheckDims(lod_tensor_vec);
return queue_.Send(lod_tensor_vec);
}
bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) {
CheckDims(lod_tensor_vec);
return queue_.Send(std::move(lod_tensor_vec));
}
std::vector<framework::LoDTensor> Pop(bool* ok = nullptr) {
std::vector<framework::LoDTensor> lod_tensor_vec;
bool success = queue_.Receive(&lod_tensor_vec);
if (ok != nullptr) *ok = success;
return lod_tensor_vec;
}
inline size_t Cap() const { return queue_.Cap(); }
inline size_t Size() const { return queue_.Size(); }
inline void Close() { return queue_.Close(); }
inline bool IsClosed() const { return queue_.IsClosed(); }
private:
void CheckDims(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(),
"Expect input size is %d but found %s", dims_.size(),
lod_tensor_vec.size());
for (size_t i = 0; i < dims_.size(); ++i) {
const auto& in_dims = framework::slice_ddim(
lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size());
const auto& expect_dims =
framework::slice_ddim(dims_[i], 1, dims_[i].size());
PADDLE_ENFORCE(in_dims == expect_dims,
"Dims of the %d-th input tensor do not match", i);
}
}
BlockingQueue<std::vector<framework::LoDTensor>> queue_;
std::vector<framework::DDim> dims_;
};
class LoDTensorBlockingQueueHolder {
public:
void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
PADDLE_ENFORCE(
queue_ == nullptr,
"LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
}
inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
return queue_;
}
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
} // namespace reader
} // namespace operators
} // namespace paddle
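A minimal sketch of how this queue is meant to be driven from the Python side through the pybind bindings added further below (module paths, variable names and shapes are assumptions, not part of this header):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid import core

    place = fluid.CPUPlace()
    # the holder variable lives in a scope; init_lod_tensor_blocking_queue fills it
    queue_var = fluid.global_scope().var("py_reader_queue")
    queue = core.init_lod_tensor_blocking_queue(queue_var, 10, [[-1, 784]])

    batch = core.LoDTensor()
    batch.set(np.random.random([32, 784]).astype('float32'), place)
    queue.push([batch])        # blocks (with the GIL released) when the queue is full
    print(queue.size(), queue.capacity())
    queue.close()              # unblocks readers; subsequent pops report failure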
......@@ -151,9 +151,6 @@ struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* dx) {
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
set_zero(context, dx, static_cast<T>(0));
int dout_offset = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
......@@ -187,6 +184,10 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
g_x->mutable_data<T>(context.GetPlace());
g_x->set_lod(x->lod());
auto& dev_ctx = context.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, g_x, static_cast<T>(0));
auto& y_lod = y->lod();
if (ref_level == -1) ref_level = y_lod.size() - 1;
// just copy the gradient
......
......@@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp {
<< " to " << offset + 1;
out->resize(offset + 1);
}
auto *out_tensor = &out->at(offset);
out_tensor->set_lod(x_tensor.lod());
if (x_tensor.memory_size() > 0) {
auto *out_tensor = &out->at(offset);
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
TensorCopy(x_tensor, place, dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod());
} else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
......
......@@ -53,6 +53,7 @@ template <typename DeviceContext, typename T>
class TensorRTEngineKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
VLOG(4) << "TensorRTEngineKernel executing";
auto engine_name = context.Attr<std::string>("engine_uniq_key");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context);
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
......@@ -51,48 +52,10 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
*var = *desc.Proto();
}
template <typename T>
void SetAttr(framework::proto::OpDesc* op, const std::string& name,
const T& data);
template <>
void SetAttr<std::string>(framework::proto::OpDesc* op, const std::string& name,
const std::string& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRING);
attr->set_s(data);
}
template <>
void SetAttr<int>(framework::proto::OpDesc* op, const std::string& name,
const int& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
const int64_t& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::LONG);
attr->set_l(data);
}
template <>
void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
const std::string& name,
const std::vector<std::string>& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRINGS);
for (const auto& s : data) {
attr->add_strings(s.c_str());
}
}
} // namespace
using inference::analysis::SetAttr;
TEST(TensorRTEngineOp, manual) {
framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks();
......
......@@ -34,6 +34,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -297,6 +298,37 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("reset", &framework::ReaderHolder::ReInit);
using LoDTensorBlockingQueue =
::paddle::operators::reader::LoDTensorBlockingQueue;
using LoDTensorBlockingQueueHolder =
::paddle::operators::reader::LoDTensorBlockingQueueHolder;
py::class_<LoDTensorBlockingQueue>(m, "LoDTensorBlockingQueue", "")
.def("push",
[](LoDTensorBlockingQueue &self,
const std::vector<framework::LoDTensor> &lod_tensor_vec) {
pybind11::gil_scoped_release release;
return self.Push(lod_tensor_vec);
})
.def("size", &LoDTensorBlockingQueue::Size)
.def("capacity", &LoDTensorBlockingQueue::Cap)
.def("close", &LoDTensorBlockingQueue::Close)
.def("is_closed", &LoDTensorBlockingQueue::IsClosed);
m.def("init_lod_tensor_blocking_queue",
[](Variable &var, size_t capacity,
const std::vector<std::vector<int64_t>> &shapes)
-> LoDTensorBlockingQueue * {
std::vector<DDim> dims(shapes.size());
std::transform(shapes.begin(), shapes.end(), dims.begin(),
[](const std::vector<int64_t> &shape) {
return make_ddim(shape);
});
auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
holder->InitOnce(capacity, dims);
return holder->GetQueue().get();
},
py::return_value_policy::reference);
py::class_<Scope>(m, "Scope", "")
.def("var",
[](Scope &self, const std::string &name) -> Variable * {
......@@ -463,9 +495,11 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_DISTRIBUTE
.def("complete", &Executor::Complete)
#endif
.def("run",
(void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
Executor::Run);
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars) {
pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars);
});
m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG);
......@@ -631,7 +665,12 @@ All parameter, weight, gradient are variables in Paddle.
&ParallelExecutor::FeedTensorsIntoLocalScopes)
.def("feed_and_split_tensor_into_local_scopes",
&ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
.def("run", &ParallelExecutor::Run);
.def("run", [](ParallelExecutor &self,
const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
pybind11::gil_scoped_release release;
self.Run(fetch_tensors, fetched_var_name);
});
BindRecordIOWriter(&m);
return m.ptr();
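Releasing the GIL in `push` and in the executors' `run` is what makes a Python feeding thread practical; a rough sketch, assuming `queue`, `train_reader`, `exe`, `loss` and `place` are set up as in the surrounding code:

    import threading
    import paddle.fluid as fluid

    def feed_worker():
        for batch in train_reader():          # batch: list of numpy arrays
            tensors = []
            for arr in batch:
                t = fluid.core.LoDTensor()
                t.set(arr, place)
                tensors.append(t)
            if not queue.push(tensors):       # push returns False once closed
                return
        queue.close()

    threading.Thread(target=feed_worker).start()
    for _ in range(100):
        exe.run(fetch_list=[loss])            # consumes data coming from the queue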
......
......@@ -146,7 +146,7 @@ void PyCPUTensorSetFromArray(
template <>
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCPUTensorSetFromArray(
inline void PyCPUTensorSetFromArray(
framework::Tensor *self,
pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast>
......@@ -185,7 +185,7 @@ void PyCUDATensorSetFromArray(
template <>
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCUDATensorSetFromArray(
inline void PyCUDATensorSetFromArray(
framework::Tensor *self,
pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast>
......@@ -224,7 +224,7 @@ void PyCUDAPinnedTensorSetFromArray(
template <>
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCUDAPinnedTensorSetFromArray(
inline void PyCUDAPinnedTensorSetFromArray(
framework::Tensor *self,
pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast>
......
......@@ -106,6 +106,8 @@ function cmake_gen() {
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_ANAKIN=${WITH_ANAKIN:-ON}
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
......@@ -133,7 +135,8 @@ EOF
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_ANAKIN=${WITH_ANAKIN:-ON}
-DWITH_ANAKIN=${WITH_ANAKIN:-ON} \
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
}
function abort(){
......
......@@ -111,7 +111,7 @@ def fetch():
paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
def convert(path):
......
......@@ -27,6 +27,7 @@ __all__ = [
'Variable',
'Program',
'Operator',
'Parameter',
'default_startup_program',
'default_main_program',
'program_guard',
......@@ -1922,7 +1923,7 @@ def program_guard(main_program, startup_program=None):
def get_var(name, program=None):
"""
Get a variable by name from the global block of a program.
Args:
name(str): name of the variable
program(Program|None): program object.
......
......@@ -95,6 +95,7 @@ __all__ = [
'relu',
'log',
'crop',
'fill_zeros_like',
]
......@@ -1993,7 +1994,8 @@ def batch_norm(input,
name=None,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=False):
do_model_average_for_mean_and_var=False,
fuse_with_relu=False):
"""
**Batch Normalization Layer**
......@@ -2036,6 +2038,7 @@ def batch_norm(input,
moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
fuse_with_relu (bool): if True, this OP performs relu after batch norm.
Returns:
Variable: A tensor variable which is the result after applying batch normalization on the input.
......@@ -2121,7 +2124,8 @@ def batch_norm(input,
"momentum": momentum,
"epsilon": epsilon,
"is_test": is_test,
"use_mkldnn": use_mkldnn
"use_mkldnn": use_mkldnn,
"fuse_with_relu": fuse_with_relu
})
return helper.append_activation(batch_norm_out)
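A brief usage sketch of the new flag (names and shapes are illustrative; it simply asks the batch_norm op to apply the ReLU itself instead of appending a separate activation):

    import paddle.fluid as fluid

    data = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
    conv = fluid.layers.conv2d(input=data, num_filters=64, filter_size=3, padding=1)
    bn = fluid.layers.batch_norm(input=conv, fuse_with_relu=True)  # ReLU fused into the op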
......@@ -2224,56 +2228,6 @@ def layer_norm(input,
return helper.append_activation(layer_norm_out)
def beam_search_decode(ids, scores, name=None):
"""
Beam Search Decode
This layers is to pack the output of beam search layer into sentences and
associated scores. It is usually called after the beam search layer.
Typically, the output of beam search layer is a tensor of selected ids, with
a tensor of the score of each id. Beam search layer's output ids, however,
are generated directly during the tree search, and they are stacked by each
level of the search tree. Thus we need to reorganize them into sentences,
based on the score of each id. This layer takes the output of beam search
layer as input and repack them into sentences.
Args:
ids (Variable): The selected ids, output of beam search layer.
scores (Variable): The associated scores of the ids, out put of beam
search layer.
name (str): The name of this layer. It is optional.
Returns:
tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores.
sentence_ids is a tensor with shape [size, length], where size is the
beam size of beam search, and length is the length of each sentence.
Note that the length of sentences may vary.
sentence_scores is a tensor with the same shape as sentence_ids.
Examples:
.. code-block:: python
ids, scores = fluid.layers.beam_search(
pre_ids, ids, scores, beam_size, end_id)
sentence_ids, sentence_scores = fluid.layers.beam_search_decode(
ids, scores)
"""
helper = LayerHelper('beam_search_decode', **locals())
sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
helper.append_op(
type="beam_search_decode",
inputs={"Ids": ids,
"Scores": scores},
outputs={
"SentenceIds": sentence_ids,
"SentenceScores": sentence_scores
})
return sentence_ids, sentence_scores
def conv2d_transpose(input,
num_filters,
output_size=None,
......@@ -2384,10 +2338,17 @@ def conv2d_transpose(input,
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
"""
helper = LayerHelper("conv2d_transpose", **locals())
input_channel = input.shape[1]
op_type = 'conv2d_transpose'
if (input_channel == groups and num_filters == input_channel and
not use_cudnn):
op_type = 'depthwise_conv2d_transpose'
helper = LayerHelper(op_type, **locals())
if not isinstance(input, Variable):
raise TypeError("Input of conv2d_transpose must be Variable")
input_channel = input.shape[1]
padding = utils.convert_to_list(padding, 2, 'padding')
stride = utils.convert_to_list(stride, 2, 'stride')
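For illustration only (not part of this diff): the depthwise operator is chosen automatically when the input channel count, the group count and the filter count coincide and cuDNN is turned off, e.g.:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[32, 28, 28], dtype='float32')
    # input_channel == groups == num_filters and use_cudnn=False
    # -> dispatched to the new depthwise_conv2d_transpose op
    y = fluid.layers.conv2d_transpose(
        input=x, num_filters=32, filter_size=4, stride=2,
        padding=1, groups=32, use_cudnn=False)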
......@@ -2421,7 +2382,7 @@ def conv2d_transpose(input,
pre_bias = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op(
type='conv2d_transpose',
type=op_type,
inputs={'Input': [input],
'Filter': [img_filter]},
outputs={'Output': pre_bias},
......@@ -2677,38 +2638,89 @@ def sequence_expand(x, y, ref_level=-1, name=None):
return tmp
def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
'''
**beam search**
This function implements the beam search algorithm.
Beam search is a classical algorithm for selecting candidate words
in a machine translation task.
def beam_search(pre_ids,
pre_scores,
ids,
scores,
beam_size,
end_id,
level=0,
name=None):
"""
Beam search is a classical algorithm for selecting candidate words in a
machine translation task.
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details.
This layer does the search in beams for one time step. Specifically, it
selects the top-K candidate word ids of current step from :attr:`ids`
according to their :attr:`scores` for all source sentences, where K is
:attr:`beam_size` and :attr:`ids, scores` are predicted results from the
    computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
    the outputs of beam_search at the previous step; they are needed to
    specially handle candidate translations that have already ended.
    Note that the :attr:`scores` passed in should be accumulated scores; if a
    length penalty is needed, apply it with extra operators before computing
    the accumulated scores. It is also suggested to obtain the top-K
    candidates beforehand and pass only those in.
    Please see the following demo for a full beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
pre_ids (Variable): ids in previous step.
ids (Variable): a LoDTensor of shape of [None,k]
scores (Variable): a LoDTensor that has the same shape and LoD with `ids`
beam_size (int): beam size for beam search
end_id (int): the token id which indicates the end of a sequence
level (int): the level of LoDTensor
pre_ids(Variable): The LodTensor variable which is the output of
beam_search at previous step. It should be a LodTensor with shape
:math:`(batch_size, 1)` and lod
:math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
first step.
pre_scores(Variable): The LodTensor variable which is the output of
beam_search at previous step.
ids(Variable): The LodTensor variable containing the candidates ids.
Its shape should be :math:`(batch_size \\times beam_size, K)`,
where :math:`K` supposed to be :attr:`beam_size`.
scores(Variable): The LodTensor variable containing the accumulated
scores corresponding to :attr:`ids` and its shape is the same as
the shape of :attr:`ids`.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
        level(int, default 0): It can be ignored and must not be changed
            currently. It means the source level of lod, which is explained
            as follows. The lod level of :attr:`ids` should be 2. The first
            level is the source level, which describes how many prefixes
            (branches) each source sentence (beam) has, and the second level
            is the sentence level, which describes how these candidates
            belong to the prefixes. The paths linking prefixes and selected
            candidates are organized and preserved in lod.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores`
Variable: The LodTensor pair containing the selected ids and the \
corresponding scores.
Examples:
.. code-block:: python
# current_score is a Tensor of shape (num_batch_size, embed_size), which
# consists score of each candidate word.
topk_scores, topk_indices = pd.topk(current_score, k=50)
selected_ids, selected_scores = pd.beam_search(
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
'''
            # Suppose `probs` contains predicted results from the computation
            # cell and `pre_ids` and `pre_scores` are the outputs of beam_search
            # at the previous step.
            topk_scores, topk_indices = layers.topk(probs, k=beam_size)
            accu_scores = layers.elementwise_add(
                x=layers.log(x=topk_scores),
                y=layers.reshape(
                    pre_scores, shape=[-1]),
                axis=0)
selected_ids, selected_scores = layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=end_id)
"""
helper = LayerHelper('beam_search', **locals())
score_type = scores.dtype
id_type = ids.dtype
......@@ -2720,6 +2732,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
type='beam_search',
inputs={
'pre_ids': pre_ids,
'pre_scores': pre_scores,
'ids': ids,
'scores': scores,
},
......@@ -2737,6 +2750,56 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
return selected_ids, selected_scores
def beam_search_decode(ids, scores, beam_size, end_id, name=None):
"""
Beam Search Decode Layer. This layer constructs the full hypotheses for
each source sentence by walking back along the LoDTensorArray :attr:`ids`
whose lods can be used to restore the path in the beam search tree.
    Please see the following demo for a full beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
ids(Variable): The LodTensorArray variable containing the selected ids
of all steps.
scores(Variable): The LodTensorArray variable containing the selected
scores of all steps.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The LodTensor pair containing the generated id sequences \
and the corresponding scores. The shapes and lods of the two \
            LodTensors are the same. The lod level is 2 and the two levels \
separately indicate how many hypotheses each source sentence has \
and how many ids each hypothesis has.
Examples:
.. code-block:: python
# Suppose `ids` and `scores` are LodTensorArray variables reserving
# the selected ids and scores of all steps
finished_ids, finished_scores = layers.beam_search_decode(
ids, scores, beam_size=5, end_id=0)
"""
helper = LayerHelper('beam_search_decode', **locals())
sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
helper.append_op(
type="beam_search_decode",
inputs={"Ids": ids,
"Scores": scores},
outputs={
"SentenceIds": sentence_ids,
"SentenceScores": sentence_scores
},
attrs={"beam_size": beam_size,
"end_id": end_id})
return sentence_ids, sentence_scores
def lstm_unit(x_t,
hidden_t_prev,
cell_t_prev,
......@@ -5015,12 +5078,12 @@ def mean_iou(input, label, num_classes):
out_correct = helper.create_tmp_variable(dtype='int32')
helper.append_op(
type="mean_iou",
inputs={"predictions": input,
"labels": label},
inputs={"Predictions": input,
"Labels": label},
outputs={
"out_mean_iou": out_mean_iou,
"out_wrong": out_wrong,
"out_correct": out_correct
"OutMeanIou": out_mean_iou,
"OutWrong": out_wrong,
"OutCorrect": out_correct
},
attrs={"num_classes": num_classes})
return out_mean_iou, out_wrong, out_correct
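A short usage sketch with the corrected input/output names (variable names, shapes and the class count are assumptions):

    import paddle.fluid as fluid

    pred = fluid.layers.data(name='pred', shape=[-1], dtype='int32', append_batch_size=False)
    label = fluid.layers.data(name='label', shape=[-1], dtype='int32', append_batch_size=False)
    miou, out_wrong, out_correct = fluid.layers.mean_iou(pred, label, num_classes=19)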
......@@ -5122,3 +5185,40 @@ def crop(x, shape=None, offsets=None, name=None):
outputs={'Out': out},
attrs=None if len(attrs) == 0 else attrs)
return out
def fill_zeros_like(x):
"""
This layer takes an input and outputs a variable that has the same structure as
    the input, with all element values set to zero. The variable can be a Tensor
or TensorArray.
.. code-block:: text
Given
X = [[0, 1, 2, 0],
[0, 3, 4, 0],
[0, 0, 0, 0]],
output is:
Out = [[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]].
Args:
x (Variable): The input variable, which could be a tensor or tensor array
Returns:
Variable: The zero-filled variable, which has the same type and shape as
the input variable.
Examples:
.. code-block:: python
y = fluid.layers.fill_zeros_like(x)
"""
helper = LayerHelper('fill_zeros_like', **locals())
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]})
return out
......@@ -18,15 +18,16 @@ import numpy as np
__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
def create_lod_tensor(data, lod, place):
def create_lod_tensor(data, recursive_seq_lens, place):
"""
Create a lod tensor from a numpy array, a list, or an existing lod tensor.
Create a lod tensor by doing the following:
1. Check that the length-based input lod is valid.
    1. Check that the length-based level of detail (LoD), also known as
       recursive_sequence_lengths, of the input is valid.
2. Convert the length-based lod to a offset-based LoD.
    2. Convert recursive_sequence_lengths to an offset-based LoD.
3. Copy the data from a numpy array, a list or a existing lod tensor to
CPU or GPU device (based on input place).
......@@ -37,45 +38,47 @@ def create_lod_tensor(data, lod, place):
Suppose we want LoDTensor to hold data for sequences of word, where each
word is represented by an integer. If we want to create a LoDTensor to
represent two sentences, one of 2 words, and one of 3 words.
represent two sentences, one of 2 words, and one of 3 words.
Then :code:`data` can be a numpy array of integers with shape (5, 1).
:code:`lod` will be [[2, 3]], indicating the length(# of words) in each
sentence. This length-based input lod [[2, 3]] will be converted to
offset-based lod [[0, 2, 5]] inside the function call.
    :code:`recursive_seq_lens` will be [[2, 3]], indicating the length (number of words) in each
sentence. This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to
offset-based LoD [[0, 2, 5]] inside the function call.
Please reference :ref:`api_guide_low_level_lod_tensor` for more details
regarding LoD.
Args:
data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
list holding the data to be copied.
lod(list): a list of lists indicating the length-based LoD info
specified by the user.
list holding the data to be copied.
recursive_seq_lens(list): a list of lists indicating the length-based level of detail
info specified by the user.
place(Place): CPU or GPU place indicating where the data in the new
LoDTensor will be stored.
Returns:
A fluid LoDTensor object with tensor data and lod info.
A fluid LoDTensor object with tensor data and recursive_seq_lens info.
"""
if isinstance(data, core.LoDTensor):
return create_lod_tensor(np.array(data), lod, place)
return create_lod_tensor(np.array(data), recursive_seq_lens, place)
elif isinstance(data, list):
# When input data is a list, it only deal with the case where the base element
# is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
# LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
# of words or other indexes in the sequence.
new_lod = []
new_recursive_seq_lens = []
for seq in data:
new_lod.append(len(seq))
assert [new_lod] == lod, "data and lod do not match"
new_recursive_seq_lens.append(len(seq))
assert [
new_recursive_seq_lens
] == recursive_seq_lens, "data and recursive_seq_lens do not match"
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
return create_lod_tensor(flattened_data, lod, place)
return create_lod_tensor(flattened_data, recursive_seq_lens, place)
elif isinstance(data, np.ndarray):
tensor = core.LoDTensor()
tensor.set(data, place)
tensor.set_recursive_sequence_lengths(lod)
tensor.set_recursive_sequence_lengths(recursive_seq_lens)
assert tensor.has_valid_recursive_sequence_lengths(
), "the provided lod info is invalid"
return tensor
......@@ -84,7 +87,8 @@ def create_lod_tensor(data, lod, place):
"data should be either a LoDTensor, a Numpy array or a list")
def create_random_int_lodtensor(lod, base_shape, place, low, high):
def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
high):
"""
Create a LoDTensor containing random integers.
......@@ -95,7 +99,7 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
The function does the following:
1. Calculate the overall shape of the LoDTensor based on the length-based
:code:`lod` input and the shape of the basic element in
:code:`recursive_seq_lens` input and the shape of the basic element in
:code:`base_shape`.
2. Create a numpy array of this shape.
......@@ -105,12 +109,13 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
Suppose we want a LoDTensor to hold data for sequences of words, where each
word is represented by an integer, and we want to create a LoDTensor to
represent two sentences, one of 2 words and one of 3 words. Then
'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall
shape of the LoDTensor would be [5, 1], holding 5 words for two sentences.
'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]].
Then the overall shape of the LoDTensor would be [5, 1], holding 5 words
for two sentences.
Args:
lod(list): a list of lists indicating the length-based LoD info
specified by the user.
recursive_seq_lens(list): a list of lists indicating the length-based
level of detail info specified by the user.
base_shape(list): the shape of the basic element to be held by the
LoDTensor.
place(Place): CPU or GPU place indicating where the data in the new
......@@ -119,11 +124,11 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
high(int): the upper bound of the random integers.
Returns:
A fluid LoDTensor object with tensor data and lod info.
A fluid LoDTensor object with tensor data and recursive_seq_lens info.
"""
assert isinstance(base_shape, list), "base_shape should be a list"
# append the total number of basic elements to the front of its shape
overall_shape = [sum(lod[-1])] + base_shape
overall_shape = [sum(recursive_seq_lens[-1])] + base_shape
# the range of integer data elements is [low, high]
data = np.random.random_integers(low, high, overall_shape).astype("int64")
return create_lod_tensor(data, lod, place)
return create_lod_tensor(data, recursive_seq_lens, place)
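# A minimal sketch of create_random_int_lodtensor for the two-sentence case
# discussed above (2 and 3 words, base element shape [1]); low=0 and high=9
# are arbitrary illustration bounds.
import paddle.fluid as fluid

recursive_seq_lens = [[2, 3]]
tensor = fluid.create_random_int_lodtensor(
    recursive_seq_lens, base_shape=[1], place=fluid.CPUPlace(), low=0, high=9)
# The resulting tensor has shape [5, 1]: 5 random word ids split into
# sequences of length 2 and 3.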
......@@ -1113,7 +1113,6 @@ class ModelAverage(Optimizer):
Args:
average_window_rate: The rate of the average window.
params_grads: A list of parameter-grad variable pairs.
min_average_window: The minimum size of average window.
max_average_window: The maximum size of average window.
......@@ -1122,8 +1121,8 @@ class ModelAverage(Optimizer):
.. code-block:: python
optimizer = fluid.optimizer.Momentum()
_, params_grads = optimizer.minimize(cost)
model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
optimizer.minimize(cost)
model_average = fluid.optimizer.ModelAverage(0.15,
min_average_window=10000,
max_average_window=20000)
for pass_id in range(args.pass_num):
......@@ -1137,7 +1136,6 @@ class ModelAverage(Optimizer):
def __init__(self,
average_window_rate,
params_grads=None,
min_average_window=10000,
max_average_window=10000,
**kwargs):
......@@ -1146,21 +1144,16 @@ class ModelAverage(Optimizer):
self.min_average_window = min_average_window
self.max_average_window = max_average_window
self.params_grads = [] if params_grads is None else params_grads
params = {}
for param, grad in self.params_grads:
if param.do_model_average != False:
params[param.name] = (param, grad)
self.params_grads = []
for param in framework.default_main_program().global_block(
).all_parameters():
if param.name not in params and param.do_model_average != False:
if param.do_model_average != False:
grad = param.block.create_var(
name=unique_name.generate(".".join([param.name, 'tmp'])),
dtype=param.dtype,
persistable=False,
stop_gradient=True)
params[param.name] = (param, grad)
self.params_grads = params.values()
self.params_grads.append((param, grad))
for param, grad in self.params_grads:
self._append_average_accumulate_op(param)
......
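# A hedged sketch of the updated ModelAverage interface shown above (no
# params_grads argument); the tiny regression network is only for illustration.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer.minimize(cost)
model_average = fluid.optimizer.ModelAverage(
    0.15, min_average_window=10000, max_average_window=20000)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# ... training loop: exe.run(fluid.default_main_program(), feed=..., fetch_list=[cost]) ...

# At evaluation time, apply() temporarily swaps in the averaged parameters and
# restores the original ones when the block exits.
with model_average.apply(exe):
    pass  # run inference / evaluation with the averaged weights here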
......@@ -160,7 +160,7 @@ class ParallelExecutor(object):
build_strategy, num_trainers, trainer_id)
self.scope = scope
def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=False):
def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
"""
Run a parallel executor with fetch_list.
......@@ -197,7 +197,7 @@ class ParallelExecutor(object):
feed_dict: Alias for feed parameter, for backward compatibility.
This parameter has been deprecated. Default None.
return_numpy(bool): Whether converts the fetched tensor to numpy.
Default: False.
Default: True.
Returns:
List: The fetched result list.
......
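# A short sketch of ParallelExecutor.run with the new return_numpy default of
# True; 'avg_cost', 'x_data' and 'y_data' are hypothetical names standing in
# for a previously built program and its input batches.
import paddle.fluid as fluid

pe = fluid.ParallelExecutor(use_cuda=False, loss_name=avg_cost.name)
loss_value, = pe.run(fetch_list=[avg_cost.name],
                     feed={'x': x_data, 'y': y_data})
# With return_numpy=True the fetched result is already a numpy array, so no
# manual conversion with np.array(...) is needed.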